full_data <- readRDS('data/full_data.rds')
daily_full <- readRDS('data/daily_full.rds')
Explore some summary statistics
Fixed Monitor stats
base_monitor_stat <- daily_full %>%
group_by(Monitor) %>%
summarise('Closest refinery' = unique(Refinery),
'Distance to nearest refinery' = unique(MinDist),
'Angle to refinery' = unique(Converted_Angle),
'Distance to nearest wrp' = unique(dist_wrp),
'Capacity of nearest wrp' = unique(capacity),
'Angle to wrp' = unique(wrp_angle),
'Distance to Dominguez Channel' = unique(dist_dc),
'Elevation' = unique(elevation),
'Enhanced Vegetation Index' = unique(EVI))
knitr::kable(base_monitor_stat, digits = 2, format = 'html')
|
Monitor
|
Closest refinery
|
Distance to nearest refinery
|
Angle to refinery
|
Distance to nearest wrp
|
Capacity of nearest wrp
|
Angle to wrp
|
Distance to Dominguez Channel
|
Elevation
|
Enhanced Vegetation Index
|
|
213th & Chico
|
Marathon (Carson)
|
2873.05
|
306
|
4297.10
|
400
|
213.09
|
50.00
|
7
|
0.12
|
|
Elm Ave
|
Torrance Refinery
|
1716.47
|
88
|
8414.89
|
400
|
142.20
|
2020.45
|
27
|
0.25
|
|
First Methodist
|
Phillips 66 (Wilmington)
|
2119.19
|
209
|
2456.12
|
400
|
355.45
|
3791.56
|
14
|
0.21
|
|
G Street
|
Phillips 66 (Wilmington)
|
758.68
|
225
|
2940.01
|
400
|
356.20
|
3748.28
|
8
|
0.09
|
|
Guenser Park
|
Torrance Refinery
|
2541.89
|
231
|
7701.97
|
400
|
159.18
|
374.61
|
16
|
0.14
|
|
Harbor Park
|
Phillips 66 (Wilmington)
|
1478.91
|
265
|
2012.17
|
400
|
5.56
|
4262.03
|
12
|
0.60
|
|
Hudson
|
Marathon (Wilmington)
|
1308.28
|
198
|
5919.88
|
400
|
271.99
|
705.23
|
8
|
0.14
|
|
Inner Port
|
Valero Refinery
|
1988.18
|
192
|
5969.55
|
15
|
227.90
|
1936.68
|
5
|
0.04
|
|
Judson
|
Marathon (Carson)
|
2792.00
|
338
|
2692.42
|
400
|
212.89
|
1481.34
|
13
|
0.14
|
|
Manhattan
|
Chevron El Segundo
|
2275.17
|
109
|
5462.08
|
850
|
325.46
|
6144.64
|
42
|
0.19
|
|
North HS
|
Torrance Refinery
|
1371.27
|
296
|
5740.63
|
400
|
130.29
|
4199.05
|
32
|
0.13
|
|
St. Luke
|
Marathon (Carson)
|
2713.63
|
189
|
6910.05
|
400
|
255.89
|
1789.55
|
10
|
0.17
|
|
West HS
|
Torrance Refinery
|
3592.78
|
11
|
9020.27
|
400
|
122.68
|
4869.95
|
31
|
0.17
|
Since Feb 2022
sincefeb2022_stat <- daily_full %>%
filter(day >= '2022-02-01') %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within county' = mean(num_odor_complaints, na.rm=T),
'Active wells 1km' = mean(active_1km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`))
knitr::kable(sincefeb2022_stat, digits = 2, format = 'html')
|
Monitor
|
Daily observations
|
Max Daily Max
|
Max Daily Average
|
Avg Daily Max
|
Avg Daily Average
|
Average Wind Speed
|
Average Wind Direction
|
Average daily odor complaints within county
|
Active wells 1km
|
|
Elm Ave
|
70
|
5.10
|
3.10
|
2.79
|
1.33
|
4.73
|
245.23
|
0.00
|
44.67
|
|
First Methodist
|
282
|
14.72
|
2.28
|
2.56
|
0.81
|
3.18
|
271.33
|
0.06
|
43.39
|
|
G Street
|
282
|
39.18
|
2.43
|
3.89
|
0.93
|
4.20
|
321.32
|
0.06
|
43.39
|
|
Guenser Park
|
278
|
6.86
|
2.66
|
2.04
|
0.94
|
4.27
|
260.90
|
0.01
|
43.41
|
|
Harbor Park
|
280
|
9.65
|
2.48
|
1.62
|
0.41
|
2.88
|
287.43
|
0.06
|
43.41
|
|
Hudson
|
283
|
173.00
|
9.46
|
3.62
|
1.10
|
3.33
|
44.38
|
0.12
|
43.40
|
|
Inner Port
|
282
|
52.63
|
5.36
|
6.48
|
0.91
|
4.19
|
215.54
|
0.05
|
43.40
|
|
Judson
|
283
|
9.52
|
2.39
|
1.80
|
0.51
|
3.63
|
275.45
|
0.40
|
43.40
|
|
Manhattan
|
283
|
5.39
|
1.10
|
0.87
|
0.36
|
3.28
|
226.77
|
0.05
|
43.40
|
|
North HS
|
70
|
4.00
|
1.60
|
1.49
|
0.50
|
4.70
|
243.79
|
0.04
|
44.67
|
|
St. Luke
|
282
|
11.62
|
2.71
|
2.34
|
0.72
|
3.50
|
323.55
|
0.12
|
43.40
|
|
West HS
|
69
|
2.90
|
0.88
|
1.15
|
0.41
|
4.70
|
242.49
|
0.04
|
44.70
|
During Disaster (October 2021 - December 2021)
disaster_stat <- daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within county' = mean(num_odor_complaints, na.rm=T),
'Active wells 1km' = mean(active_1km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`))
knitr::kable(disaster_stat, digits = 2, format = 'html')
|
Monitor
|
Daily observations
|
Max Daily Max
|
Max Daily Average
|
Avg Daily Max
|
Avg Daily Average
|
Average Wind Speed
|
Average Wind Direction
|
Average daily odor complaints within county
|
Active wells 1km
|
|
213th & Chico
|
47
|
3817.24
|
431.49
|
450.16
|
51.67
|
3.37
|
289.33
|
11.98
|
46.21
|
|
Elm Ave
|
92
|
63.50
|
8.15
|
7.73
|
1.34
|
3.74
|
234.91
|
0.46
|
46.34
|
|
First Methodist
|
56
|
149.47
|
14.11
|
13.80
|
2.07
|
2.67
|
296.88
|
1.86
|
46.34
|
|
G Street
|
56
|
48.67
|
6.61
|
10.66
|
1.95
|
3.66
|
331.85
|
1.86
|
46.34
|
|
Guenser Park
|
56
|
211.67
|
14.49
|
17.98
|
1.91
|
3.47
|
273.88
|
0.41
|
46.34
|
|
Harbor Park
|
56
|
75.93
|
9.37
|
8.49
|
1.20
|
2.36
|
304.86
|
0.55
|
46.34
|
|
Hudson
|
56
|
192.64
|
19.45
|
18.37
|
3.61
|
2.51
|
18.24
|
0.62
|
46.34
|
|
Inner Port
|
56
|
136.50
|
12.21
|
18.26
|
3.54
|
3.45
|
17.38
|
0.09
|
46.34
|
|
Judson
|
56
|
742.25
|
69.08
|
59.11
|
7.57
|
2.81
|
286.73
|
22.30
|
46.34
|
|
Manhattan
|
56
|
17.01
|
3.24
|
3.41
|
1.44
|
2.74
|
199.17
|
0.11
|
46.34
|
|
North HS
|
92
|
98.50
|
7.88
|
6.94
|
1.14
|
3.74
|
235.24
|
0.53
|
46.34
|
|
St. Luke
|
56
|
119.72
|
12.26
|
10.67
|
2.52
|
2.99
|
0.44
|
0.62
|
46.34
|
|
West HS
|
92
|
41.50
|
4.89
|
3.50
|
0.76
|
3.74
|
234.89
|
0.53
|
46.34
|
Normal Period (Jan 2020- May 2023) excluding disaster
normal_stat <- daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
group_by(Monitor) %>%
summarise('Daily observations' = n(),
'Max Daily Max' = max(H2S_daily_max, na.rm=T),
'Max Daily Average' = max(H2S_daily_avg, na.rm=T),
'Avg Daily Max' = mean(H2S_daily_max, na.rm=T),
'Avg Daily Average' = mean(H2S_daily_avg, na.rm=T),
'Average Wind Speed' = mean(ws_avg, na.rm=T),
'Average Wind Direction' = as.numeric(mean(circular(wd_avg,
units = 'degrees'),
na.rm=T)),
'Average daily odor complaints within county' = mean(num_odor_complaints, na.rm=T),
'Active wells 1km' = mean(active_1km, na.rm=T)) %>%
mutate('Average Wind Direction' = if_else(`Average Wind Direction` < 0,
`Average Wind Direction`+360,
`Average Wind Direction`))
knitr::kable(normal_stat, digits = 2, format = 'html')
|
Monitor
|
Daily observations
|
Max Daily Max
|
Max Daily Average
|
Avg Daily Max
|
Avg Daily Average
|
Average Wind Speed
|
Average Wind Direction
|
Average daily odor complaints within county
|
Active wells 1km
|
|
213th & Chico
|
16
|
14.27
|
3.92
|
4.82
|
2.99
|
2.50
|
304.22
|
2.69
|
45.00
|
|
Elm Ave
|
731
|
11.40
|
3.20
|
2.19
|
0.97
|
4.53
|
241.58
|
0.02
|
48.59
|
|
First Methodist
|
462
|
17.58
|
2.81
|
2.65
|
0.83
|
3.24
|
274.30
|
0.13
|
45.19
|
|
G Street
|
460
|
382.76
|
18.66
|
4.57
|
0.91
|
4.27
|
323.56
|
0.10
|
45.19
|
|
Guenser Park
|
457
|
6.86
|
2.66
|
1.73
|
0.71
|
4.47
|
262.07
|
0.01
|
45.21
|
|
Harbor Park
|
460
|
10.34
|
2.67
|
1.71
|
0.45
|
2.91
|
292.68
|
0.06
|
45.21
|
|
Hudson
|
460
|
173.00
|
9.46
|
3.40
|
1.15
|
3.36
|
25.30
|
0.10
|
45.19
|
|
Inner Port
|
462
|
52.63
|
5.36
|
5.80
|
0.83
|
4.36
|
224.57
|
0.05
|
45.20
|
|
Judson
|
457
|
15.78
|
3.55
|
1.98
|
0.64
|
3.70
|
277.60
|
0.41
|
45.17
|
|
Manhattan
|
387
|
5.39
|
1.58
|
1.05
|
0.47
|
3.12
|
231.20
|
0.10
|
44.27
|
|
North HS
|
740
|
30.60
|
2.71
|
1.94
|
1.03
|
4.53
|
241.27
|
0.04
|
48.60
|
|
St. Luke
|
462
|
11.62
|
2.91
|
2.25
|
0.70
|
3.60
|
317.83
|
0.10
|
45.20
|
|
West HS
|
737
|
18.50
|
3.01
|
1.55
|
0.58
|
4.53
|
241.00
|
0.04
|
48.60
|
Explore some GAM models
Five minute H2S
h2s_model_a <- gam(H2S~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## I(1/MinDist^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.172e+00 3.213e-03 364.72 <2e-16 ***
## year2023 -3.263e-01 2.800e-03 -116.54 <2e-16 ***
## wd_secQ2 -2.560e-01 3.751e-03 -68.23 <2e-16 ***
## wd_secQ3 -2.898e-01 3.892e-03 -74.46 <2e-16 ***
## wd_secQ4 -2.095e-01 3.492e-03 -59.99 <2e-16 ***
## ws -5.424e-02 4.176e-04 -129.91 <2e-16 ***
## I(1/MinDist^2) 2.114e+05 2.329e+03 90.77 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.996 8 2496 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0946 Deviance explained = 9.47%
## GCV = 0.92965 Scale est. = 0.92963 n = 772251
plot(h2s_model_a)

gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4657067 248.8 8132744 434.4 7275617 388.6
## Vcells 104430763 796.8 270449874 2063.4 224923882 1716.1
h2s_model_b <- gam(H2S~s(as.numeric(month),bs='cc')+wd_sec+ws+I(1/MinDist^2)+
s(mon_utm_x, mon_utm_y, bs='tp', k = 8),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + wd_sec + ws + I(1/MinDist^2) +
## s(mon_utm_x, mon_utm_y, bs = "tp", k = 8)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.013e+00 3.170e-03 319.43 <2e-16 ***
## wd_secQ2 -1.953e-01 3.695e-03 -52.86 <2e-16 ***
## wd_secQ3 -1.816e-01 3.905e-03 -46.51 <2e-16 ***
## wd_secQ4 -1.131e-01 3.486e-03 -32.43 <2e-16 ***
## ws -6.919e-02 4.146e-04 -166.88 <2e-16 ***
## I(1/MinDist^2) 2.983e+05 2.856e+03 104.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.999 8 1710 <2e-16 ***
## s(mon_utm_x,mon_utm_y) 6.999 7 6572 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.131 Deviance explained = 13.1%
## GCV = 0.89282 Scale est. = 0.8928 n = 772251
plot(h2s_model_b)


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4669994 249.5 8132744 434.4 7275617 388.6
## Vcells 114946073 877.0 324619848 2476.7 317696763 2423.9
# Include converted angle and weekday
h2s_model_c <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + wd_sec + ws +
I(1/MinDist^2) + Converted_Angle +
s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
as.factor(weekday),
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + wd_sec + ws +
## I(1/MinDist^2) + Converted_Angle + s(mon_utm_x, mon_utm_y,
## bs = "tp", k = 10) + as.factor(weekday)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.494e+00 2.321e-02 107.466 < 2e-16 ***
## year2023 -3.331e-01 2.866e-03 -116.237 < 2e-16 ***
## wd_secQ2 -1.863e-01 3.645e-03 -51.123 < 2e-16 ***
## wd_secQ3 -1.914e-01 3.848e-03 -49.753 < 2e-16 ***
## wd_secQ4 -1.096e-01 3.443e-03 -31.842 < 2e-16 ***
## ws -6.123e-02 4.080e-04 -150.068 < 2e-16 ***
## I(1/MinDist^2) -3.291e-05 3.304e-07 -99.583 < 2e-16 ***
## Converted_Angle -6.125e-03 1.104e-04 -55.496 < 2e-16 ***
## as.factor(weekday).L 9.861e-02 2.802e-03 35.190 < 2e-16 ***
## as.factor(weekday).Q -1.699e-01 2.810e-03 -60.461 < 2e-16 ***
## as.factor(weekday).C -3.825e-03 2.805e-03 -1.364 0.173
## as.factor(weekday)^4 -2.049e-02 2.815e-03 -7.277 3.41e-13 ***
## as.factor(weekday)^5 -5.801e-02 2.809e-03 -20.654 < 2e-16 ***
## as.factor(weekday)^6 -2.508e-02 2.821e-03 -8.889 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.998 8 2704 <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.999 9 6486 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 30/31
## R-sq.(adj) = 0.155 Deviance explained = 15.6%
## GCV = 0.86717 Scale est. = 0.86714 n = 772251
plot(h2s_model_c)


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4671696 249.5 8132744 434.4 7275617 388.6
## Vcells 126969342 968.7 389623817 2972.6 379647656 2896.5
# Include converted angle and weekday
h2s_model_d <- gam(H2S ~
s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_sec + ws + downwind_ref + downwind_wrp +
I(1/MinDist^2) + dist_wrp + capacity +
Converted_Angle +
s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km
,
data = full_data %>% filter(day >= '2022-02-01'))
summary(h2s_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## dist_wrp + capacity + Converted_Angle + s(mon_utm_x, mon_utm_y,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.598e+00 1.076e-01 -24.142 < 2e-16 ***
## year2023 -1.204e-01 6.381e-03 -18.862 < 2e-16 ***
## as.factor(weekday).L 1.056e-01 2.772e-03 38.089 < 2e-16 ***
## as.factor(weekday).Q -1.728e-01 2.779e-03 -62.180 < 2e-16 ***
## as.factor(weekday).C -6.411e-03 2.773e-03 -2.312 0.0208 *
## as.factor(weekday)^4 -1.711e-02 2.783e-03 -6.146 7.94e-10 ***
## as.factor(weekday)^5 -5.627e-02 2.776e-03 -20.268 < 2e-16 ***
## as.factor(weekday)^6 -2.645e-02 2.788e-03 -9.486 < 2e-16 ***
## wd_secQ2 -1.503e-01 3.698e-03 -40.648 < 2e-16 ***
## wd_secQ3 -1.289e-01 3.923e-03 -32.865 < 2e-16 ***
## wd_secQ4 -7.449e-02 3.472e-03 -21.453 < 2e-16 ***
## ws -6.841e-02 4.074e-04 -167.932 < 2e-16 ***
## downwind_ref 1.374e-01 3.901e-03 35.229 < 2e-16 ***
## downwind_wrp -1.702e-02 4.304e-03 -3.954 7.69e-05 ***
## I(1/MinDist^2) 2.788e-05 1.646e-06 16.943 < 2e-16 ***
## dist_wrp 3.799e-04 8.572e-06 44.322 < 2e-16 ***
## capacity 6.154e-03 8.181e-05 75.220 < 2e-16 ***
## Converted_Angle -2.943e-03 1.099e-04 -26.770 < 2e-16 ***
## monthly_oil_1km 6.223e-05 2.765e-06 22.506 < 2e-16 ***
## monthly_gas_1km 1.292e-04 1.243e-05 10.390 < 2e-16 ***
## active_1km -2.572e-02 1.946e-03 -13.218 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.989 8.000 2000 <2e-16 ***
## s(mon_utm_x,mon_utm_y) 8.008 8.008 5964 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 36/38
## R-sq.(adj) = 0.175 Deviance explained = 17.6%
## GCV = 0.84667 Scale est. = 0.84663 n = 772251
plot(h2s_model_d)


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4672591 249.6 8132744 434.4 7275617 388.6
## Vcells 143240372 1092.9 467628580 3567.8 389378290 2970.8
# Include elvation, EVI, 3D smooth, odor, dist to dom channel, temp, hum, precip
h2s_model_e <- readRDS('rfiles/h2s_model_e.rds')
# h2s_model_e <- gam(H2S ~
# s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
# wd_sec + ws + downwind_ref + downwind_wrp +
# I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
# Converted_Angle + elevation + EVI +
# s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
# te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
# monthly_oil_1km + monthly_gas_1km + active_1km +
# num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
# ,
# data = full_data %>% filter(day >= '2022-02-01'))
#saveRDS(h2s_model_e, 'rfiles/h2s_model_e.rds')
summary(h2s_model_e)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle +
## elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.285e+00 6.510e-01 3.510 0.000449 ***
## year2023 9.870e-02 1.712e-02 5.766 8.12e-09 ***
## as.factor(weekday).L 1.129e-01 2.852e-03 39.594 < 2e-16 ***
## as.factor(weekday).Q -1.747e-01 2.841e-03 -61.484 < 2e-16 ***
## as.factor(weekday).C 1.584e-02 2.824e-03 5.609 2.03e-08 ***
## as.factor(weekday)^4 -1.810e-02 2.828e-03 -6.400 1.55e-10 ***
## as.factor(weekday)^5 -5.583e-02 2.808e-03 -19.883 < 2e-16 ***
## as.factor(weekday)^6 -2.485e-02 2.822e-03 -8.806 < 2e-16 ***
## wd_secQ2 -1.031e-01 3.741e-03 -27.569 < 2e-16 ***
## wd_secQ3 -1.454e-01 3.962e-03 -36.702 < 2e-16 ***
## wd_secQ4 -1.146e-01 3.490e-03 -32.841 < 2e-16 ***
## ws -6.036e-02 4.158e-04 -145.169 < 2e-16 ***
## downwind_ref 1.098e-01 3.923e-03 28.001 < 2e-16 ***
## downwind_wrp 8.079e-03 4.326e-03 1.868 0.061802 .
## I(1/MinDist^2) 1.707e-05 6.264e-06 2.726 0.006420 **
## I(1/dist_wrp^2) 1.056e-06 3.794e-07 2.783 0.005394 **
## capacity 7.434e-03 1.338e-03 5.556 2.75e-08 ***
## wrp_angle 4.063e-03 9.670e-04 4.201 2.65e-05 ***
## Converted_Angle -1.586e-02 2.157e-03 -7.352 1.95e-13 ***
## elevation -2.059e-01 1.293e-02 -15.920 < 2e-16 ***
## EVI 2.975e+00 6.113e-01 4.868 1.13e-06 ***
## monthly_oil_1km 1.856e-04 6.710e-06 27.663 < 2e-16 ***
## monthly_gas_1km -4.097e-04 3.277e-05 -12.504 < 2e-16 ***
## active_1km -3.057e-02 2.530e-03 -12.081 < 2e-16 ***
## num_odor_complaints 2.821e-02 1.917e-03 14.718 < 2e-16 ***
## I(1/dist_dc^2) 1.019e-04 9.015e-06 11.303 < 2e-16 ***
## avg_temp 9.080e-03 4.090e-04 22.202 < 2e-16 ***
## avg_hum -7.505e-03 1.056e-04 -71.073 < 2e-16 ***
## precip -8.285e-02 5.145e-03 -16.102 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.994 8.000 310.126
## s(mon_utm_x,mon_utm_y) 1.865 1.865 0.459
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 81.249 81.253 899.338
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(mon_utm_x,mon_utm_y) 0.542
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/135
## R-sq.(adj) = 0.255 Deviance explained = 25.5%
## GCV = 0.80029 Scale est. = 0.80016 n = 712259
plot(h2s_model_e)



gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4680354 250.0 8132744 434.4 7275617 388.6
## Vcells 181371461 1383.8 467628580 3567.8 389378290 2970.8
# Model only the month of the event, October 2021
h2s_model_f <- readRDS('rfiles/h2s_model_f.rds')
# h2s_model_f <- gam(H2S ~ wd_sec + ws + downwind_ref + downwind_wrp +
# I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
# Converted_Angle + elevation + EVI +
# s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
# te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
# monthly_oil_1km + monthly_gas_1km + active_1km +
# num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip,
# data = full_data %>% filter(year == '2021', month == '10'))
# saveRDS(h2s_model_f, 'rfiles/h2s_model_f.rds')
summary(h2s_model_f)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle +
## elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.099e-07 4.542e-07 2.003 0.04516 *
## wd_secQ2 3.129e+00 9.512e-01 3.289 0.00101 **
## wd_secQ3 6.611e+00 9.439e-01 7.004 2.50e-12 ***
## wd_secQ4 7.882e+00 8.910e-01 8.845 < 2e-16 ***
## ws -2.321e+00 1.159e-01 -20.026 < 2e-16 ***
## downwind_ref 1.639e+00 8.589e-01 1.908 0.05642 .
## downwind_wrp 1.494e+00 1.077e+00 1.388 0.16529
## I(1/MinDist^2) -4.561e-03 1.475e-03 -3.093 0.00198 **
## I(1/dist_wrp^2) -9.325e-04 5.499e-05 -16.959 < 2e-16 ***
## capacity 7.671e-01 1.692e-01 4.533 5.82e-06 ***
## wrp_angle 8.449e-01 1.825e-01 4.629 3.68e-06 ***
## Converted_Angle -2.556e+00 3.095e-01 -8.258 < 2e-16 ***
## elevation -2.652e+01 2.679e+00 -9.899 < 2e-16 ***
## EVI 8.337e+02 1.155e+02 7.220 5.23e-13 ***
## monthly_oil_1km 1.533e-02 7.959e-03 1.926 0.05412 .
## monthly_gas_1km 2.295e-03 1.205e-03 1.905 0.05680 .
## active_1km 3.981e-05 2.000e-05 1.991 0.04647 *
## num_odor_complaints 8.458e-01 6.767e-02 12.498 < 2e-16 ***
## I(1/dist_dc^2) 7.748e+00 3.991e-01 19.415 < 2e-16 ***
## avg_temp 1.349e+00 1.864e-01 7.236 4.66e-13 ***
## avg_hum 1.629e-01 5.258e-02 3.099 0.00194 **
## precip -4.019e+01 7.628e+00 -5.269 1.37e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(mon_utm_x,mon_utm_y) 2.017 2.02 4.625
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 84.337 84.58 61.960
## p-value
## s(mon_utm_x,mon_utm_y) 0.00868 **
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 108/120
## R-sq.(adj) = 0.265 Deviance explained = 26.5%
## GCV = 5448.6 Scale est. = 5441.4 n = 77510
plot(h2s_model_f)


gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4681877 250.1 8132744 434.4 7275617 388.6
## Vcells 185357855 1414.2 467628580 3567.8 389378290 2970.8
# Model data excluding the months of the event
h2s_model_g <- readRDS('rfiles/h2s_model_g.rds')
# h2s_model_g <- gam(H2S ~ s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
# wd_sec + ws + downwind_ref + downwind_wrp +
# I(1/MinDist^2) + I(1/dist_wrp^2) + capacity + wrp_angle +
# Converted_Angle + elevation + EVI +
# s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
# te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
# monthly_oil_1km + monthly_gas_1km + active_1km +
# num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum + precip
# ,
# data = full_data %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
# saveRDS(h2s_model_g, 'rfiles/h2s_model_g.rds')
summary(h2s_model_g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_sec + ws + downwind_ref + downwind_wrp + I(1/MinDist^2) +
## I(1/dist_wrp^2) + capacity + wrp_angle + Converted_Angle +
## elevation + EVI + s(mon_utm_x, mon_utm_y, bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.539e+00 3.610e-01 15.343 < 2e-16 ***
## year2021 3.707e-01 1.375e-02 26.967 < 2e-16 ***
## year2022 -5.225e-01 2.573e-02 -20.307 < 2e-16 ***
## year2023 -6.492e-01 2.547e-02 -25.492 < 2e-16 ***
## as.factor(weekday).L 8.394e-02 2.184e-03 38.426 < 2e-16 ***
## as.factor(weekday).Q -1.356e-01 2.182e-03 -62.151 < 2e-16 ***
## as.factor(weekday).C 1.406e-03 2.184e-03 0.644 0.5197
## as.factor(weekday)^4 -2.889e-02 2.188e-03 -13.204 < 2e-16 ***
## as.factor(weekday)^5 -1.732e-02 2.181e-03 -7.943 1.98e-15 ***
## as.factor(weekday)^6 -5.050e-03 2.184e-03 -2.312 0.0208 *
## wd_secQ2 -1.049e-01 2.952e-03 -35.543 < 2e-16 ***
## wd_secQ3 -1.878e-01 2.958e-03 -63.463 < 2e-16 ***
## wd_secQ4 -1.651e-01 2.796e-03 -59.057 < 2e-16 ***
## ws -4.613e-02 3.094e-04 -149.116 < 2e-16 ***
## downwind_ref 3.540e-02 2.614e-03 13.545 < 2e-16 ***
## downwind_wrp 2.151e-02 3.270e-03 6.577 4.80e-11 ***
## I(1/MinDist^2) -4.864e-05 8.792e-06 -5.533 3.15e-08 ***
## I(1/dist_wrp^2) 1.483e-06 9.456e-07 1.568 0.1168
## capacity 5.414e-03 5.868e-04 9.225 < 2e-16 ***
## wrp_angle 3.176e-03 7.460e-04 4.258 2.06e-05 ***
## Converted_Angle -1.473e-02 1.020e-03 -14.451 < 2e-16 ***
## elevation -1.200e-01 9.824e-03 -12.215 < 2e-16 ***
## EVI 2.459e+00 4.676e-01 5.259 1.44e-07 ***
## monthly_oil_1km -2.575e-05 2.232e-06 -11.534 < 2e-16 ***
## monthly_gas_1km -1.831e-04 9.137e-06 -20.036 < 2e-16 ***
## active_1km -3.881e-02 9.430e-04 -41.155 < 2e-16 ***
## num_odor_complaints 2.632e-02 1.466e-03 17.949 < 2e-16 ***
## I(1/dist_dc^2) 2.766e-03 3.971e-03 0.696 0.4861
## avg_temp 6.183e-03 2.696e-04 22.937 < 2e-16 ***
## avg_hum -6.301e-03 7.620e-05 -82.688 < 2e-16 ***
## precip -7.811e-02 5.252e-03 -14.872 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.999 8.000 2547.0
## s(mon_utm_x,mon_utm_y) 1.682 1.683 246.9
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 85.198 85.296 1455.0
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(mon_utm_x,mon_utm_y) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 129/137
## R-sq.(adj) = 0.166 Deviance explained = 16.6%
## GCV = 1.1485 Scale est. = 1.1484 n = 1696814
plot(h2s_model_g)



gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4684353 250.2 8132744 434.4 7275617 388.6
## Vcells 275649503 2103.1 467628580 3567.8 389378290 2970.8
Daily max H2S
# With month, year, wind sector/speed, dist to refinery, refinery
h2s_dm_model_a <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+year+wd_avg+ws_avg+
I(1/MinDist^2),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + wd_avg +
## ws_avg + I(1/MinDist^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.031e+00 3.477e-01 8.717 <2e-16 ***
## year2023 -4.346e-01 2.197e-01 -1.978 0.048 *
## wd_avg 6.613e-04 1.035e-03 0.639 0.523
## ws_avg -8.840e-02 6.339e-02 -1.394 0.163
## I(1/MinDist^2) 7.658e-07 8.232e-08 9.303 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.184 8 2.026 6.47e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 12/13
## R-sq.(adj) = 0.00874 Deviance explained = 1.07%
## GCV = 21.457 Scale est. = 21.407 n = 2664
plot(h2s_dm_model_a)

# Also with location of monitor
h2s_dm_model_b <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
I(1/MinDist^2)+Refinery+s(monitor_lat, monitor_lon, bs='tp', k=10),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg +
## I(1/MinDist^2) + Refinery + s(monitor_lat, monitor_lon, bs = "tp",
## k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.418e+00 1.420e+01 0.663 0.50715
## wd_avg 3.025e-03 1.010e-03 2.994 0.00278 **
## ws_avg -3.111e-01 6.333e-02 -4.913 9.54e-07 ***
## I(1/MinDist^2) -6.123e-05 1.635e-03 -0.037 0.97012
## RefineryMarathon (Carson) -9.065e-01 1.761e+01 -0.051 0.95894
## RefineryMarathon (Wilmington) -3.737e+00 1.745e+01 -0.214 0.83047
## RefineryPhillips 66 (Wilmington) -1.409e+01 1.626e+01 -0.866 0.38643
## RefineryTorrance Refinery -2.578e+00 1.400e+01 -0.184 0.85387
## RefineryValero Refinery -7.568e+00 1.766e+01 -0.429 0.66824
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.035 8.000 1.723 0.000203 ***
## s(monitor_lat,monitor_lon) 5.379 5.672 9.312 1.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 25/26
## R-sq.(adj) = 0.127 Deviance explained = 13.2%
## GCV = 18.968 Scale est. = 18.858 n = 2664
plot(h2s_dm_model_b)


# Also include angle to refinery
h2s_dm_model_c <- gam(H2S_daily_max~s(as.numeric(month),bs='cc')+wd_avg+ws_avg+
I(1/MinDist^2)+Converted_Angle+
s(monitor_lat, monitor_lon, bs='tp', k=10),
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + wd_avg + ws_avg +
## I(1/MinDist^2) + Converted_Angle + s(monitor_lat, monitor_lon,
## bs = "tp", k = 10)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.608e+00 8.828e-01 4.087 4.5e-05 ***
## wd_avg 2.924e-03 1.015e-03 2.882 0.003983 **
## ws_avg -2.235e-01 6.205e-02 -3.602 0.000322 ***
## I(1/MinDist^2) -8.626e-06 3.814e-05 -0.226 0.821082
## Converted_Angle -3.444e-03 3.892e-03 -0.885 0.376317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.113 8.000 2.042 4.88e-05 ***
## s(monitor_lat,monitor_lon) 7.071 7.919 41.137 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 21/22
## R-sq.(adj) = 0.117 Deviance explained = 12.1%
## GCV = 19.169 Scale est. = 19.074 n = 2664
plot(h2s_dm_model_c)


h2s_dm_model_d <- gam(H2S_daily_max ~
s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/MinDist^2) + dist_wrp + capacity +
Converted_Angle +
s(monitor_lat, monitor_lon, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km
,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + dist_wrp +
## capacity + Converted_Angle + s(monitor_lat, monitor_lon,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.696e+01 5.143e+00 -3.298 0.000986 ***
## year2023 -6.419e-01 4.082e-01 -1.573 0.115927
## as.factor(weekday).L 4.659e-01 2.216e-01 2.103 0.035589 *
## as.factor(weekday).Q -9.062e-01 2.223e-01 -4.077 4.7e-05 ***
## as.factor(weekday).C 8.727e-03 2.215e-01 0.039 0.968577
## as.factor(weekday)^4 -1.728e-01 2.223e-01 -0.777 0.437033
## as.factor(weekday)^5 -2.848e-01 2.217e-01 -1.285 0.198943
## as.factor(weekday)^6 -1.696e-01 2.223e-01 -0.763 0.445474
## wd_avg 3.371e-03 1.028e-03 3.279 0.001057 **
## ws_avg -3.041e-01 6.608e-02 -4.601 4.4e-06 ***
## daily_downwind_ref 7.377e-01 3.989e-01 1.849 0.064527 .
## I(1/MinDist^2) 2.246e-04 3.883e-04 0.578 0.563013
## dist_wrp 2.258e-03 6.614e-04 3.415 0.000648 ***
## capacity 9.466e-03 1.166e-02 0.812 0.416776
## Converted_Angle 9.685e-03 8.699e-03 1.113 0.265657
## monthly_oil_1km 2.479e-04 1.548e-04 1.602 0.109297
## monthly_gas_1km -1.138e-03 6.554e-04 -1.736 0.082603 .
## active_1km 3.974e-02 6.286e-02 0.632 0.527264
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 2.319 8.000 2.087 6.51e-05 ***
## s(monitor_lat,monitor_lon) 7.785 7.976 14.036 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 34/35
## R-sq.(adj) = 0.137 Deviance explained = 14.5%
## GCV = 18.829 Scale est. = 18.638 n = 2664
plot(h2s_dm_model_d)


# Try to include as many variables as possible
h2s_dm_model_e <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
capacity + daily_downwind_wrp +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_dm_model_e)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km +
## elevation + EVI + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.184e+01 7.409e+00 -1.598 0.11020
## year2023 2.174e-01 1.024e+00 0.212 0.83183
## as.character(weekday)Mon -6.411e-01 3.101e-01 -2.067 0.03882 *
## as.character(weekday)Sat -7.266e-01 3.099e-01 -2.344 0.01913 *
## as.character(weekday)Sun -1.279e+00 3.102e-01 -4.123 3.86e-05 ***
## as.character(weekday)Thu -2.864e-01 3.102e-01 -0.923 0.35608
## as.character(weekday)Tue -1.762e-01 3.078e-01 -0.572 0.56720
## as.character(weekday)Wed -4.809e-02 3.118e-01 -0.154 0.87745
## wd_avg 2.344e-03 1.084e-03 2.161 0.03078 *
## ws_avg -2.294e-01 7.129e-02 -3.217 0.00131 **
## daily_downwind_ref 4.051e-01 3.944e-01 1.027 0.30441
## I(1/dist_wrp^2) 4.422e-07 1.655e-06 0.267 0.78930
## capacity -5.974e-03 2.648e-03 -2.256 0.02415 *
## daily_downwind_wrp 1.783e-01 4.040e-01 0.441 0.65903
## monthly_oil_1km 5.216e-04 2.741e-04 1.903 0.05714 .
## monthly_gas_1km -1.588e-03 1.987e-03 -0.799 0.42419
## active_1km 2.959e-01 1.237e-01 2.393 0.01680 *
## elevation -1.318e-01 5.729e-02 -2.301 0.02146 *
## EVI -3.691e+00 7.407e-01 -4.983 6.68e-07 ***
## num_odor_complaints 1.128e-01 1.520e-01 0.742 0.45800
## I(1/dist_dc^2) 3.870e-05 7.395e-05 0.523 0.60085
## avg_temp 6.252e-02 2.837e-02 2.204 0.02763 *
## avg_hum -1.857e-02 7.398e-03 -2.511 0.01212 *
## precip 4.781e-01 4.334e-01 1.103 0.27009
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 3.379e-09 8.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 4.037e+00 4.451 3.041
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 5.550e+01 76.000 1.469
## p-value
## s(as.numeric(month)) 0.85254
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 0.00945 **
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 3.23e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 115/117
## R-sq.(adj) = 0.161 Deviance explained = 18.7%
## GCV = 18.681 Scale est. = 18.109 n = 2664
plot(h2s_dm_model_e)



e <- getViz(h2s_dm_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_f <- gam(H2S_daily_max ~ as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
capacity + daily_downwind_wrp +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))
summary(h2s_dm_model_f)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ as.character(weekday) + wd_avg + ws_avg + daily_downwind_ref +
## I(1/dist_wrp^2) + capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km +
## active_1km + elevation + EVI + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.933615 1.112132 -2.638 0.00852 **
## as.character(weekday)Mon 3.939302 25.809815 0.153 0.87873
## as.character(weekday)Sat -1.274518 24.967489 -0.051 0.95930
## as.character(weekday)Sun 6.518458 25.156102 0.259 0.79561
## as.character(weekday)Thu 10.719203 25.476323 0.421 0.67406
## as.character(weekday)Tue -12.039497 26.266684 -0.458 0.64683
## as.character(weekday)Wed 9.587227 25.907187 0.370 0.71144
## wd_avg 0.079809 0.082575 0.966 0.33411
## ws_avg 0.102886 6.616534 0.016 0.98760
## daily_downwind_ref -30.773112 24.395106 -1.261 0.20754
## I(1/dist_wrp^2) -0.001388 0.000147 -9.442 < 2e-16 ***
## capacity 6.856419 0.710540 9.650 < 2e-16 ***
## daily_downwind_wrp 8.299696 29.553392 0.281 0.77891
## monthly_oil_1km 0.053059 0.115651 0.459 0.64652
## monthly_gas_1km -0.001199 0.314860 -0.004 0.99696
## active_1km -96.629305 36.632200 -2.638 0.00852 **
## elevation 24.515412 7.897485 3.104 0.00198 **
## EVI 804.034316 104.569368 7.689 4.69e-14 ***
## num_odor_complaints 3.663308 0.869852 4.211 2.85e-05 ***
## I(1/dist_dc^2) 9.401428 0.971411 9.678 < 2e-16 ***
## avg_temp 2.543734 2.454075 1.037 0.30029
## avg_hum 0.217984 0.586008 0.372 0.71001
## precip -6.489537 26.367207 -0.246 0.80566
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.929 8.99 12.175
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.052 80.00 4.366
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 109/112
## R-sq.(adj) = 0.511 Deviance explained = 55.8%
## GCV = 37085 Scale est. = 33454 n = 827
plot(h2s_dm_model_f)


f <- getViz(h2s_dm_model_f)
plot(sm(f, 1)) + l_fitRaster() + l_fitContour() + l_points()

h2s_dm_model_g <- gam(H2S_daily_max ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
capacity + daily_downwind_wrp +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
summary(h2s_dm_model_g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_max ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
## capacity + daily_downwind_wrp + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## as.numeric(day), k = c(10, 10), d = c(2, 1), bs = c("tp",
## "cc")) + monthly_oil_1km + monthly_gas_1km + active_1km +
## elevation + EVI + num_odor_complaints + I(1/dist_dc^2) +
## avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.916e+00 5.515e+00 0.529 0.597053
## year2021 8.579e-01 9.300e-01 0.922 0.356332
## year2022 -2.466e-01 1.422e+00 -0.173 0.862369
## year2023 -5.194e-01 1.452e+00 -0.358 0.720584
## as.character(weekday)Mon -3.061e-01 2.768e-01 -1.106 0.268828
## as.character(weekday)Sat -5.006e-01 2.749e-01 -1.821 0.068640 .
## as.character(weekday)Sun -8.683e-01 2.753e-01 -3.153 0.001622 **
## as.character(weekday)Thu -4.797e-02 2.758e-01 -0.174 0.861931
## as.character(weekday)Tue 2.499e-01 2.750e-01 0.909 0.363486
## as.character(weekday)Wed 4.445e-02 2.770e-01 0.160 0.872503
## wd_avg 1.333e-03 1.003e-03 1.329 0.183975
## ws_avg -2.430e-01 6.243e-02 -3.893 0.000100 ***
## daily_downwind_ref 1.440e-01 2.800e-01 0.514 0.607101
## I(1/dist_wrp^2) -3.044e-06 6.687e-07 -4.553 5.40e-06 ***
## capacity 3.387e-04 6.443e-03 0.053 0.958080
## daily_downwind_wrp -5.676e-03 3.347e-01 -0.017 0.986471
## monthly_oil_1km 9.574e-05 1.675e-04 0.572 0.567570
## monthly_gas_1km 1.013e-04 6.476e-04 0.156 0.875724
## active_1km 1.156e-01 5.775e-02 2.002 0.045331 *
## elevation -2.426e-01 6.877e-02 -3.528 0.000421 ***
## EVI -4.221e+00 1.023e+00 -4.125 3.76e-05 ***
## num_odor_complaints 3.987e-01 1.295e-01 3.079 0.002084 **
## I(1/dist_dc^2) 1.558e-03 1.652e-03 0.943 0.345664
## avg_temp 1.508e-03 2.045e-02 0.074 0.941227
## avg_hum -2.471e-02 6.150e-03 -4.019 5.93e-05 ***
## precip 4.173e-01 5.063e-01 0.824 0.409849
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.256e-08 8.000 0.000
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 7.759e+00 8.001 4.529
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 4.023e+01 80.000 1.062
## p-value
## s(as.numeric(month)) 0.517
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 1.53e-05 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 9.00e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/123
## R-sq.(adj) = 0.0679 Deviance explained = 7.87%
## GCV = 33.746 Scale est. = 33.352 n = 6162
plot(h2s_dm_model_g)



g <- getViz(h2s_dm_model_g)
plot(sm(g, 2)) + l_fitRaster() + l_fitContour() + l_points()

Daily Average
# Look at daily average
h2s_da_model_a <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
Converted_Angle +
s(monitor_lon, monitor_lat, bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) +
## capacity + Converted_Angle + s(monitor_lon, monitor_lat,
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.742e+00 9.324e-01 -5.086 3.91e-07 ***
## year2023 -1.515e-01 5.680e-02 -2.667 0.007704 **
## as.factor(weekday).L 8.212e-02 2.360e-02 3.480 0.000509 ***
## as.factor(weekday).Q -1.599e-01 2.367e-02 -6.755 1.75e-11 ***
## as.factor(weekday).C 1.925e-02 2.357e-02 0.817 0.414075
## as.factor(weekday)^4 -6.537e-03 2.364e-02 -0.277 0.782124
## as.factor(weekday)^5 -4.980e-02 2.357e-02 -2.113 0.034702 *
## as.factor(weekday)^6 -4.449e-02 2.363e-02 -1.883 0.059767 .
## wd_avg 7.982e-04 1.096e-04 7.286 4.19e-13 ***
## ws_avg -1.139e-01 7.202e-03 -15.809 < 2e-16 ***
## daily_downwind_ref 2.721e-01 4.253e-02 6.399 1.85e-10 ***
## I(1/MinDist^2) 2.940e-04 3.325e-05 8.844 < 2e-16 ***
## I(1/dist_wrp^2) -1.625e-05 2.391e-06 -6.798 1.31e-11 ***
## capacity 1.565e-02 1.127e-03 13.885 < 2e-16 ***
## Converted_Angle -2.730e-03 9.770e-04 -2.795 0.005234 **
## monthly_oil_1km 6.935e-05 2.246e-05 3.087 0.002043 **
## monthly_gas_1km 4.867e-05 1.016e-04 0.479 0.631782
## active_1km -2.583e-02 1.482e-02 -1.743 0.081492 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.557 8 19.58 <2e-16 ***
## s(monitor_lon,monitor_lat) 8.988 9 85.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 33/35
## R-sq.(adj) = 0.421 Deviance explained = 42.8%
## GCV = 0.21298 Scale est. = 0.21037 n = 2664
plot(h2s_da_model_a)


a <- getViz(h2s_da_model_a)
plot(sm(a, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Look at daily average
h2s_da_model_b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref +
I(1/MinDist^2) + I(1/dist_wrp^2) + capacity +
Converted_Angle +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_b)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + I(1/MinDist^2) + I(1/dist_wrp^2) +
## capacity + Converted_Angle + s(I(mon_utm_x/10^3), I(mon_utm_y/10^3),
## bs = "tp", k = 10) + monthly_oil_1km + monthly_gas_1km +
## active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.291e+00 8.757e-01 -3.758 0.000175 ***
## year2023 -1.515e-01 5.681e-02 -2.667 0.007708 **
## as.factor(weekday).L 8.212e-02 2.360e-02 3.480 0.000509 ***
## as.factor(weekday).Q -1.599e-01 2.367e-02 -6.755 1.76e-11 ***
## as.factor(weekday).C 1.925e-02 2.357e-02 0.817 0.414145
## as.factor(weekday)^4 -6.549e-03 2.364e-02 -0.277 0.781748
## as.factor(weekday)^5 -4.978e-02 2.357e-02 -2.112 0.034749 *
## as.factor(weekday)^6 -4.449e-02 2.363e-02 -1.883 0.059817 .
## wd_avg 7.982e-04 1.095e-04 7.286 4.20e-13 ***
## ws_avg -1.138e-01 7.202e-03 -15.803 < 2e-16 ***
## daily_downwind_ref 2.721e-01 4.252e-02 6.400 1.84e-10 ***
## I(1/MinDist^2) 1.320e-04 1.904e-05 6.931 5.25e-12 ***
## I(1/dist_wrp^2) -9.831e-06 1.268e-06 -7.752 1.29e-14 ***
## capacity 1.202e-02 8.420e-04 14.280 < 2e-16 ***
## Converted_Angle -2.608e-03 9.596e-04 -2.718 0.006616 **
## monthly_oil_1km 6.932e-05 2.247e-05 3.086 0.002053 **
## monthly_gas_1km 4.888e-05 1.016e-04 0.481 0.630348
## active_1km -2.584e-02 1.482e-02 -1.743 0.081413 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.559 8 19.58 <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.981 9 85.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 33/35
## R-sq.(adj) = 0.421 Deviance explained = 42.8%
## GCV = 0.21297 Scale est. = 0.21037 n = 2664
plot(h2s_da_model_b)


b <- getViz(h2s_da_model_b)
plot(sm(b, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_c <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.factor(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_c)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.factor(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## monthly_oil_1km + monthly_gas_1km + active_1km
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.092e+00 8.277e-01 -4.944 8.13e-07 ***
## year2023 -1.502e-01 5.687e-02 -2.642 0.008293 **
## as.factor(weekday).L 8.210e-02 2.362e-02 3.475 0.000518 ***
## as.factor(weekday).Q -1.601e-01 2.370e-02 -6.757 1.73e-11 ***
## as.factor(weekday).C 1.946e-02 2.359e-02 0.825 0.409569
## as.factor(weekday)^4 -6.335e-03 2.366e-02 -0.268 0.788954
## as.factor(weekday)^5 -5.012e-02 2.359e-02 -2.124 0.033761 *
## as.factor(weekday)^6 -4.475e-02 2.365e-02 -1.892 0.058579 .
## wd_avg 7.928e-04 1.097e-04 7.230 6.31e-13 ***
## ws_avg -1.152e-01 7.196e-03 -16.005 < 2e-16 ***
## daily_downwind_ref 2.708e-01 4.257e-02 6.362 2.35e-10 ***
## capacity 1.264e-02 8.169e-04 15.474 < 2e-16 ***
## I(1/dist_wrp^2) -1.222e-05 1.314e-06 -9.304 < 2e-16 ***
## monthly_oil_1km 6.987e-05 2.249e-05 3.107 0.001913 **
## monthly_gas_1km 4.838e-05 1.017e-04 0.476 0.634221
## active_1km -2.578e-02 1.484e-02 -1.737 0.082429 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.558 8 19.47 <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.992 9 90.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 32/33
## R-sq.(adj) = 0.42 Deviance explained = 42.6%
## GCV = 0.21339 Scale est. = 0.21086 n = 2664
plot(h2s_da_model_c)


c <- getViz(h2s_da_model_c)
plot(sm(c, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_d <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_d)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## monthly_oil_1km + monthly_gas_1km + active_1km + daily_downwind_wrp +
## elevation + EVI
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.080e+00 1.012e+00 1.068 0.28574
## year2023 -1.517e-01 5.678e-02 -2.671 0.00760 **
## as.character(weekday)Mon -9.035e-02 3.341e-02 -2.705 0.00688 **
## as.character(weekday)Sat -9.858e-02 3.336e-02 -2.955 0.00315 **
## as.character(weekday)Sun -1.974e-01 3.323e-02 -5.940 3.23e-09 ***
## as.character(weekday)Thu -4.795e-02 3.340e-02 -1.436 0.15121
## as.character(weekday)Tue -8.359e-03 3.317e-02 -0.252 0.80104
## as.character(weekday)Wed 3.880e-02 3.354e-02 1.157 0.24744
## wd_avg 7.977e-04 1.095e-04 7.283 4.30e-13 ***
## ws_avg -1.143e-01 7.206e-03 -15.858 < 2e-16 ***
## daily_downwind_ref 2.737e-01 4.248e-02 6.444 1.38e-10 ***
## capacity 1.097e-03 1.436e-03 0.764 0.44488
## I(1/dist_wrp^2) -8.660e-08 5.329e-08 -1.625 0.10426
## monthly_oil_1km 6.917e-05 2.246e-05 3.081 0.00209 **
## monthly_gas_1km 4.965e-05 1.015e-04 0.489 0.62483
## active_1km -2.562e-02 1.481e-02 -1.730 0.08375 .
## daily_downwind_wrp 5.139e-02 4.297e-02 1.196 0.23182
## elevation -1.412e-02 8.007e-03 -1.764 0.07790 .
## EVI -1.067e+00 1.446e-01 -7.374 2.21e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 7.553 8.000 19.57 <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 7.916 7.997 51.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 35/36
## R-sq.(adj) = 0.421 Deviance explained = 42.8%
## GCV = 0.21302 Scale est. = 0.21034 n = 2664
plot(h2s_da_model_d)


d <- getViz(h2s_da_model_d)
plot(sm(d, 2)) + l_fitRaster() + l_fitContour() + l_points()

h2s_da_model_e <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_e)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.705e+00 7.312e-01 -5.067 4.33e-07 ***
## year2023 3.857e-01 1.227e-01 3.144 0.001688 **
## as.character(weekday)Mon -9.050e-02 2.665e-02 -3.396 0.000695 ***
## as.character(weekday)Sat -9.599e-02 2.661e-02 -3.607 0.000316 ***
## as.character(weekday)Sun -1.993e-01 2.653e-02 -7.513 7.97e-14 ***
## as.character(weekday)Thu -4.584e-02 2.663e-02 -1.721 0.085311 .
## as.character(weekday)Tue -1.143e-02 2.646e-02 -0.432 0.665634
## as.character(weekday)Wed 3.472e-02 2.675e-02 1.298 0.194507
## wd_avg 6.304e-04 9.123e-05 6.910 6.10e-12 ***
## ws_avg -1.064e-01 6.042e-03 -17.612 < 2e-16 ***
## daily_downwind_ref 2.073e-01 3.439e-02 6.029 1.89e-09 ***
## capacity 1.147e-02 1.482e-03 7.741 1.41e-14 ***
## I(1/dist_wrp^2) 1.399e-07 1.185e-07 1.181 0.237685
## monthly_oil_1km 1.385e-04 4.413e-05 3.138 0.001719 **
## monthly_gas_1km 3.211e-04 2.507e-04 1.281 0.200372
## active_1km -4.537e-02 1.792e-02 -2.532 0.011400 *
## daily_downwind_wrp 5.485e-02 3.489e-02 1.572 0.116065
## elevation -2.933e-02 1.071e-02 -2.738 0.006223 **
## EVI -6.902e-01 1.770e-01 -3.900 9.86e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.841 8.000 12.37
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.471 8.471 21.76
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.879 76.000 20.86
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 110/112
## R-sq.(adj) = 0.633 Deviance explained = 64.8%
## GCV = 0.13916 Scale est. = 0.13343 n = 2664
plot(h2s_da_model_e)



e <- getViz(h2s_da_model_e)
plot(sm(e, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
#
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Since feb 2022
h2s_da_model_f <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(day >= '2022-02-01'))
summary(h2s_da_model_f)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum +
## precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.696e+00 7.111e-01 -6.603 4.88e-11 ***
## year2023 4.435e-01 1.232e-01 3.600 0.000324 ***
## as.character(weekday)Mon -8.809e-02 2.550e-02 -3.454 0.000561 ***
## as.character(weekday)Sat -9.022e-02 2.540e-02 -3.552 0.000389 ***
## as.character(weekday)Sun -2.061e-01 2.542e-02 -8.108 7.92e-16 ***
## as.character(weekday)Thu -3.686e-02 2.542e-02 -1.450 0.147173
## as.character(weekday)Tue -8.548e-03 2.527e-02 -0.338 0.735230
## as.character(weekday)Wed 2.998e-02 2.555e-02 1.173 0.240810
## wd_avg 3.508e-04 9.030e-05 3.885 0.000105 ***
## ws_avg -7.855e-02 6.026e-03 -13.035 < 2e-16 ***
## daily_downwind_ref 1.567e-01 3.302e-02 4.744 2.21e-06 ***
## capacity 1.003e-02 1.427e-03 7.031 2.63e-12 ***
## I(1/dist_wrp^2) 2.122e-07 1.133e-07 1.873 0.061128 .
## monthly_oil_1km 1.611e-04 4.233e-05 3.805 0.000145 ***
## monthly_gas_1km -1.306e-04 2.429e-04 -0.538 0.590730
## active_1km -9.257e-03 1.728e-02 -0.536 0.592319
## daily_downwind_wrp 6.698e-02 3.329e-02 2.012 0.044311 *
## elevation -2.960e-02 1.022e-02 -2.897 0.003804 **
## EVI -7.131e-01 1.698e-01 -4.200 2.76e-05 ***
## num_odor_complaints 2.552e-02 1.252e-02 2.038 0.041693 *
## I(1/dist_dc^2) 2.080e-05 9.122e-06 2.281 0.022659 *
## avg_temp 1.206e-02 2.685e-03 4.491 7.42e-06 ***
## avg_hum -5.599e-03 7.027e-04 -7.968 2.42e-15 ***
## precip -7.076e-02 3.637e-02 -1.945 0.051851 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.854 8.000 13.30
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.471 8.471 23.66
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 75.878 76.000 23.14
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 114/117
## R-sq.(adj) = 0.666 Deviance explained = 68%
## GCV = 0.12669 Scale est. = 0.12128 n = 2664
plot(h2s_da_model_f)



f <- getViz(h2s_da_model_f)
plot(sm(f, 2)) + l_fitRaster() + l_fitContour() + l_points()

# plotRGL(sm(d, 3), fix = c(`as.numeric(day)` = 0), residuals = F)
#
# # try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_e), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Disaster only
h2s_da_model_g <- gam(H2S_daily_avg ~ month + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(year == '2021', month %in% c('10', '11', '12')))
summary(h2s_da_model_g)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ month + as.character(weekday) + wd_avg + ws_avg +
## daily_downwind_ref + capacity + I(1/dist_wrp^2) + s(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), bs = "tp", k = 10) + te(I(mon_utm_x/10^3),
## I(mon_utm_y/10^3), as.numeric(day), k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km +
## active_1km + daily_downwind_wrp + elevation + EVI + num_odor_complaints +
## I(1/dist_dc^2) + avg_temp + avg_hum + precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2213924 0.0934139 -2.370 0.01804 *
## month11 -4.2142854 1.7782277 -2.370 0.01804 *
## month12 1.1011938 0.4647034 2.370 0.01806 *
## as.character(weekday)Mon -2.7259172 2.9255211 -0.932 0.35176
## as.character(weekday)Sat -2.8891542 2.8298781 -1.021 0.30761
## as.character(weekday)Sun -2.4956249 2.8514127 -0.875 0.38173
## as.character(weekday)Thu -0.3339398 2.8872365 -0.116 0.90795
## as.character(weekday)Tue -4.5461381 2.9772550 -1.527 0.12720
## as.character(weekday)Wed -1.2618609 2.9362292 -0.430 0.66750
## wd_avg 0.0080694 0.0093628 0.862 0.38904
## ws_avg -0.0079586 0.7510760 -0.011 0.99155
## daily_downwind_ref -2.7311456 2.7658732 -0.987 0.32375
## capacity 0.7818623 0.0804573 9.718 < 2e-16 ***
## I(1/dist_wrp^2) -0.0001603 0.0000167 -9.600 < 2e-16 ***
## monthly_oil_1km -0.0058962 0.0097159 -0.607 0.54413
## monthly_gas_1km 0.0147016 0.0338346 0.435 0.66404
## active_1km -7.2923513 3.0769295 -2.370 0.01804 *
## daily_downwind_wrp -0.8402903 3.3511018 -0.251 0.80208
## elevation 2.7201937 0.8943859 3.041 0.00244 **
## EVI 90.9526953 11.8445995 7.679 5.05e-14 ***
## num_odor_complaints 0.5226646 0.0988284 5.289 1.62e-07 ***
## I(1/dist_dc^2) 1.0786879 0.1100253 9.804 < 2e-16 ***
## avg_temp 0.3827439 0.2785720 1.374 0.16987
## avg_hum 0.0272100 0.0665065 0.409 0.68256
## precip -2.5913331 2.9921008 -0.866 0.38674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.926 8.989 12.392
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 52.823 80.000 3.689
## p-value
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 109/114
## R-sq.(adj) = 0.501 Deviance explained = 55%
## GCV = 476.76 Scale est. = 429.63 n = 827
plot(h2s_da_model_g)


g <- getViz(h2s_da_model_g)
plot(sm(g, 1)) + l_fitRaster() + l_fitContour() + l_points()

# Exclude disaster
h2s_da_model_h <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))))
summary(h2s_da_model_h)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum +
## precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.523e+00 1.295e+00 -3.493 0.000482 ***
## year2021 3.333e-01 9.799e-02 3.402 0.000674 ***
## year2022 -5.309e-01 2.081e-01 -2.551 0.010766 *
## year2023 -6.752e-01 2.155e-01 -3.133 0.001738 **
## as.character(weekday)Mon -5.458e-02 2.341e-02 -2.331 0.019771 *
## as.character(weekday)Sat -8.351e-02 2.322e-02 -3.596 0.000325 ***
## as.character(weekday)Sun -1.702e-01 2.328e-02 -7.313 2.96e-13 ***
## as.character(weekday)Thu -1.154e-02 2.329e-02 -0.496 0.620154
## as.character(weekday)Tue -1.368e-02 2.326e-02 -0.588 0.556374
## as.character(weekday)Wed 1.616e-02 2.341e-02 0.690 0.490096
## wd_avg 1.512e-04 8.622e-05 1.754 0.079466 .
## ws_avg -7.304e-02 5.490e-03 -13.304 < 2e-16 ***
## daily_downwind_ref 2.274e-02 2.396e-02 0.949 0.342705
## capacity 1.873e-02 2.376e-03 7.884 3.73e-15 ***
## I(1/dist_wrp^2) -4.082e-06 1.369e-06 -2.983 0.002870 **
## monthly_oil_1km -4.735e-05 1.804e-05 -2.625 0.008694 **
## monthly_gas_1km -2.152e-04 7.046e-05 -3.054 0.002271 **
## active_1km -4.449e-02 7.093e-03 -6.273 3.80e-10 ***
## daily_downwind_wrp 2.858e-02 2.842e-02 1.006 0.314673
## elevation 7.088e-02 1.153e-02 6.149 8.28e-10 ***
## EVI 9.990e-01 2.838e-01 3.520 0.000435 ***
## num_odor_complaints 2.815e-02 1.119e-02 2.516 0.011891 *
## I(1/dist_dc^2) 3.415e-02 8.835e-03 3.865 0.000112 ***
## avg_temp 5.698e-03 2.075e-03 2.747 0.006037 **
## avg_hum -6.004e-03 5.830e-04 -10.299 < 2e-16 ***
## precip -3.164e-02 4.322e-02 -0.732 0.464151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 7.92 8 44.56
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.00 9 24.04
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 74.52 80 23.17
## p-value
## s(as.numeric(month)) <2e-16 ***
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/123
## R-sq.(adj) = 0.462 Deviance explained = 47.2%
## GCV = 0.24227 Scale est. = 0.23773 n = 6162
plot(h2s_da_model_h)



h <- getViz(h2s_da_model_h)
plot(sm(h, 2)) + l_fitRaster() + l_fitContour() + l_points()

# Disaster indicator
h2s_da_model_i <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip + disaster,
data = daily_full %>% mutate(disaster = if_else(year == '2021',
month %in% c('10', '11', '12'), 1, 0)))
summary(h2s_da_model_i)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum +
## precip + disaster
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.573e+02 1.399e+01 -25.530 < 2e-16 ***
## year2021 3.989e+00 1.666e+00 2.394 0.01667 *
## year2022 5.441e+00 2.075e+00 2.622 0.00875 **
## year2023 4.519e+00 2.475e+00 1.826 0.06786 .
## as.character(weekday)Mon -3.258e-01 3.764e-01 -0.866 0.38676
## as.character(weekday)Sat -2.928e-01 3.732e-01 -0.785 0.43272
## as.character(weekday)Sun -3.098e-01 3.734e-01 -0.830 0.40675
## as.character(weekday)Thu -2.927e-01 3.745e-01 -0.782 0.43448
## as.character(weekday)Tue -6.662e-01 3.743e-01 -1.780 0.07516 .
## as.character(weekday)Wed -3.629e-01 3.757e-01 -0.966 0.33414
## wd_avg 1.248e-03 1.348e-03 0.926 0.35423
## ws_avg -7.136e-02 8.683e-02 -0.822 0.41120
## daily_downwind_ref -1.815e-02 3.815e-01 -0.048 0.96205
## capacity 6.881e-01 2.416e-02 28.477 < 2e-16 ***
## I(1/dist_wrp^2) -2.730e-04 1.995e-05 -13.687 < 2e-16 ***
## monthly_oil_1km 3.638e-04 2.667e-04 1.364 0.17256
## monthly_gas_1km 7.303e-04 9.757e-04 0.748 0.45420
## active_1km 9.149e-02 9.378e-02 0.976 0.32928
## daily_downwind_wrp 4.554e-01 4.533e-01 1.005 0.31508
## elevation 2.536e+00 1.487e-01 17.053 < 2e-16 ***
## EVI 7.930e+01 3.021e+00 26.253 < 2e-16 ***
## num_odor_complaints 9.522e-01 2.756e-02 34.552 < 2e-16 ***
## I(1/dist_dc^2) 2.030e+00 1.117e-01 18.171 < 2e-16 ***
## avg_temp 1.727e-02 3.066e-02 0.563 0.57323
## avg_hum -2.637e-04 8.511e-03 -0.031 0.97529
## precip -4.241e-02 5.876e-01 -0.072 0.94246
## disaster 3.211e+00 1.026e+00 3.130 0.00176 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 1.783 8.000 0.327
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 8.957 8.997 94.223
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 79.836 80.000 6.382
## p-value
## s(as.numeric(month)) 0.186
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 122/124
## R-sq.(adj) = 0.331 Deviance explained = 34.2%
## GCV = 70.74 Scale est. = 69.57 n = 6989
plot(h2s_da_model_i)



i <- getViz(h2s_da_model_i)
plot(sm(i, 2)) + l_fitRaster() + l_fitContour() + l_points()

plotRGL(sm(i, 2), fix = c(`as.numeric(day)` = 0), residuals = F)
# try a plot using wireframe
# plot3d <- matrix(fitted(h2s_da_model_i), ncol = 20)
# wireframe(plot3d, drape=TRUE, colorkey=TRUE)
# Everything
h2s_da_model_j <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + capacity +
I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3),as.numeric(day),
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
I(1/dist_dc^2) + avg_temp + avg_hum + precip,
data = daily_full)
summary(h2s_da_model_j)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_daily_avg ~ s(as.numeric(month), bs = "cc") + year + as.character(weekday) +
## wd_avg + ws_avg + daily_downwind_ref + capacity + I(1/dist_wrp^2) +
## s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs = "tp", k = 10) +
## te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
## k = c(10, 10), d = c(2, 1), bs = c("tp", "cc")) + monthly_oil_1km +
## monthly_gas_1km + active_1km + daily_downwind_wrp + elevation +
## EVI + num_odor_complaints + I(1/dist_dc^2) + avg_temp + avg_hum +
## precip
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.496e+02 1.385e+01 -25.237 <2e-16 ***
## year2021 7.052e-01 1.335e+00 0.528 0.5974
## year2022 1.405e+00 1.689e+00 0.832 0.4055
## year2023 6.499e-01 2.180e+00 0.298 0.7656
## as.character(weekday)Mon -3.333e-01 3.772e-01 -0.884 0.3769
## as.character(weekday)Sat -2.687e-01 3.740e-01 -0.719 0.4724
## as.character(weekday)Sun -2.956e-01 3.742e-01 -0.790 0.4297
## as.character(weekday)Thu -3.134e-01 3.753e-01 -0.835 0.4036
## as.character(weekday)Tue -6.738e-01 3.751e-01 -1.796 0.0725 .
## as.character(weekday)Wed -3.640e-01 3.765e-01 -0.967 0.3337
## wd_avg 1.248e-03 1.350e-03 0.925 0.3551
## ws_avg -7.657e-02 8.704e-02 -0.880 0.3790
## daily_downwind_ref -7.636e-02 3.820e-01 -0.200 0.8416
## capacity 6.803e-01 2.395e-02 28.403 <2e-16 ***
## I(1/dist_wrp^2) -2.625e-04 1.033e-05 -25.420 <2e-16 ***
## monthly_oil_1km 4.951e-04 2.608e-04 1.899 0.0577 .
## monthly_gas_1km 3.955e-04 9.490e-04 0.417 0.6769
## active_1km 1.023e-01 9.493e-02 1.077 0.2814
## daily_downwind_wrp 3.830e-01 4.538e-01 0.844 0.3986
## elevation 2.481e+00 1.435e-01 17.287 <2e-16 ***
## EVI 7.803e+01 2.932e+00 26.614 <2e-16 ***
## num_odor_complaints 9.839e-01 2.678e-02 36.738 <2e-16 ***
## I(1/dist_dc^2) 1.967e+00 7.384e-02 26.644 <2e-16 ***
## avg_temp 2.239e-02 3.056e-02 0.733 0.4638
## avg_hum -1.267e-03 8.550e-03 -0.148 0.8822
## precip -1.722e-01 5.868e-01 -0.293 0.7692
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F
## s(as.numeric(month)) 2.676 8 0.630
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) 9.000 9 91.536
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) 68.173 80 5.593
## p-value
## s(as.numeric(month)) 0.0709 .
## s(I(mon_utm_x/10^3),I(mon_utm_y/10^3)) <2e-16 ***
## te(I(mon_utm_x/10^3),I(mon_utm_y/10^3),as.numeric(day)) <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Rank: 121/123
## R-sq.(adj) = 0.328 Deviance explained = 33.8%
## GCV = 70.933 Scale est. = 69.879 n = 6989
plot(h2s_da_model_j)



knitr::kable(tibble(Model = c('Since Feb 2022',
'Disaster Only',
'Exclude Disaster',
'Everything w Disaster Indicator',
'Everything w.o Disaster Indicator'),
'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
round(summary(h2s_da_model_g)$r.sq, 2),
round(summary(h2s_da_model_h)$r.sq, 2),
round(summary(h2s_da_model_i)$r.sq, 2),
round(summary(h2s_da_model_j)$r.sq, 2))))
| Since Feb 2022 |
0.67 |
| Disaster Only |
0.50 |
| Exclude Disaster |
0.46 |
| Everything w Disaster Indicator |
0.33 |
| Everything w.o Disaster Indicator |
0.33 |
80/20 Cross Validation on Daily Average models
result <- tibble(Model = character(),
'80/20 Train R-Sq' = numeric(),
'80/20 Test R-Sq' = numeric())
validation_result_8020 <- tibble(Model = character(),
'Coef' = numeric(),
'R-Sq' = numeric())
predictors <- c('H2S_daily_avg', 'month', 'year', 'weekday', 'wd_avg', 'ws_avg', 'daily_downwind_ref', 'dist_wrp',
'mon_utm_x', 'mon_utm_y', 'day', 'monthly_oil_1km', 'monthly_gas_1km',
'active_1km', 'daily_downwind_wrp', 'elevation', 'EVI', 'num_odor_complaints',
'dist_dc', 'capacity', 'avg_temp', 'avg_hum', 'precip')
set.seed(123)
# model 1: since Feb 2022
train <- daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
slice_sample(prop = 0.8)
test <- anti_join(daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f2 <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f2, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2)$n - 1)/
(summary(h2s_da_model_f2)$n - summary(h2s_da_model_f2)$np - 1)
result <- rbind(result, tibble(Model = 'Since Feb 2022',
'80/20 Train R-Sq' = summary(h2s_da_model_f2)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
f2_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 GAM') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_8020 <- rbind(validation_result_8020,
tibble(Model = 'Since Feb 2022',
'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$coefficients[2, 1],
'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$r.squared))
# Model 2: Disaster (Oct-Dec 2021) only
train <- daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
slice_sample(prop = 0.8)
test <- anti_join(daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f3 <- gam(H2S_daily_avg ~ month + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f3, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3)$n - 1)/
(summary(h2s_da_model_f3)$n - summary(h2s_da_model_f3)$np - 1)
result <- rbind(result, tibble(Model = 'Disaster Only',
'80/20 Train R-Sq' = summary(h2s_da_model_f3)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
f3_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only GAM') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_8020 <- rbind(validation_result_8020,
tibble(Model = 'Disaster Only',
'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$coefficients[2, 1],
'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$r.squared))
# Model 3: Excluding Disaster
train <- daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
slice_sample(prop = 0.8)
test <- anti_join(daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f4 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f4, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4)$n - 1)/
(summary(h2s_da_model_f4)$n - summary(h2s_da_model_f4)$np - 1)
result <- rbind(result, tibble(Model = 'Exclude Disaster',
'80/20 Train R-Sq' = summary(h2s_da_model_f4)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
# Model 4: Everything with Disaster Indicator
train <- daily_full %>%
select(all_of(predictors)) %>%
mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>%
filter(complete.cases(.))
train <- rbind(
train %>% filter(disaster==1) %>% slice_sample(prop = 0.8),
train %>% filter(disaster==0) %>% slice_sample(prop = 0.8)
)
test <- anti_join(daily_full %>%
select(all_of(predictors)) %>%
mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip, disaster)`
h2s_da_model_f5 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
disaster +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f5, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5)$n - 1)/
(summary(h2s_da_model_f5)$n - summary(h2s_da_model_f5)$np - 1)
result <- rbind(result, tibble(Model = 'Everything w. Disaster Indicator',
'80/20 Train R-Sq' = summary(h2s_da_model_f5)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
# Model 5: Everything
train <- daily_full %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)) %>%
slice_sample(prop = 0.8)
train <- rbind(
train %>% filter(year == '2021' & month %in% c('10', '11', '12')) %>% slice_sample(prop = 0.8),
train %>% filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>% slice_sample(prop = 0.8)
)
test <- anti_join(daily_full %>%
select(all_of(predictors)) %>%
filter(complete.cases(.)), train)
## Joining with `by = join_by(H2S_daily_avg, month, year, weekday, wd_avg, ws_avg,
## daily_downwind_ref, dist_wrp, mon_utm_x, mon_utm_y, day, monthly_oil_1km,
## monthly_gas_1km, active_1km, daily_downwind_wrp, elevation, EVI,
## num_odor_complaints, dist_dc, capacity, avg_temp, avg_hum, precip)`
h2s_da_model_f6 <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f6, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6)$n - 1)/
(summary(h2s_da_model_f5)$n - summary(h2s_da_model_f6)$np - 1)
result <- rbind(result, tibble(Model = 'Everything w.o Disaster Indicator',
'80/20 Train R-Sq' = summary(h2s_da_model_f6)$r.sq,
'80/20 Test R-Sq' = adj_r2_test))
f6_obs_vs_pred_plot <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator Gam') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
f6_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(NA, 4), ylim = c(NA, 4)) +
theme_bw()
validation_result_8020 <- rbind(validation_result_8020,
tibble(Model = 'Everything',
'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$coefficients[2, 1],
'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$r.squared))
knitr::kable(tibble(Model = c('Since Feb 2022',
'Disaster Only',
'Exclude Disaster',
'Everything w Disaster Indicator',
'Everything w.o Disaster Indicator'),
'R-Sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
round(summary(h2s_da_model_g)$r.sq, 2),
round(summary(h2s_da_model_h)$r.sq, 2),
round(summary(h2s_da_model_i)$r.sq, 2),
round(summary(h2s_da_model_j)$r.sq, 2))) %>%
left_join(result, join_by(Model)))
| Since Feb 2022 |
0.67 |
0.6538821 |
0.6823609 |
| Disaster Only |
0.50 |
0.5175137 |
0.2921882 |
| Exclude Disaster |
0.46 |
0.5189897 |
0.2950658 |
| Everything w Disaster Indicator |
0.33 |
NA |
NA |
| Everything w.o Disaster Indicator |
0.33 |
0.3372143 |
0.4443146 |
Cross Validation on each monitor
result_cv <- tibble(Model = character(),
'CV Avg Train R-Sq' = numeric(),
'CV Avg Test R-Sq' = numeric())
# model 1: since Feb 2022
post2022feb <- daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(post2022feb$Monitor)
cv1_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- post2022feb %>%
filter(Monitor != monitors[i])
test <- post2022feb %>%
filter(Monitor == monitors[i])
h2s_da_model_f2b <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f2b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f2b)$n - 1)/
(summary(h2s_da_model_f2b)$n - summary(h2s_da_model_f2b)$np - 1)
cv1_result <- rbind(cv1_result, tibble(monitors[i],
summary(h2s_da_model_f2b)$r.sq,
summary(h2s_da_model_f2b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Since Feb 2022',
'CV Avg Train R-Sq' = mean(cv1_result$`summary(h2s_da_model_f2b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv1_result$adj_r2_test))))
# model 2: Disaster only
disaster <- daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(disaster$Monitor)
cv2_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- disaster %>%
filter(Monitor != monitors[i])
test <- disaster %>%
filter(Monitor == monitors[i])
h2s_da_model_f3b <- gam(H2S_daily_avg~ month + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f3b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f3b)$n - 1)/
(summary(h2s_da_model_f3b)$n - summary(h2s_da_model_f3b)$np - 1)
cv2_result <- rbind(cv2_result, tibble(monitors[i],
summary(h2s_da_model_f3b)$r.sq,
summary(h2s_da_model_f3b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Disaster Only',
'CV Avg Train R-Sq' = mean(cv2_result$`summary(h2s_da_model_f3b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv2_result$adj_r2_test))))
# model 3: Exclude Disaster
exclude_disaster <- daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(exclude_disaster$Monitor)
cv3_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- exclude_disaster %>%
filter(Monitor != monitors[i])
test <- exclude_disaster %>%
filter(Monitor == monitors[i])
h2s_da_model_f4b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f4b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f4b)$n - 1)/
(summary(h2s_da_model_f4b)$n - summary(h2s_da_model_f4b)$np - 1)
cv3_result <- rbind(cv3_result, tibble(monitors[i],
summary(h2s_da_model_f4b)$r.sq,
summary(h2s_da_model_f4b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Exclude Disaster',
'CV Avg Train R-Sq' = mean(cv3_result$`summary(h2s_da_model_f4b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv3_result$adj_r2_test))))
# model 4: Everything w Disaster Indicator
full_complete <- daily_full %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.)) %>%
mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0))
monitors <- unique(full_complete$Monitor)
cv4_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- full_complete %>%
filter(Monitor != monitors[i])
test <- full_complete %>%
filter(Monitor == monitors[i])
h2s_da_model_f5b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
disaster + capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f5b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f5b)$n - 1)/
(summary(h2s_da_model_f5b)$n - summary(h2s_da_model_f5b)$np - 1)
cv4_result <- rbind(cv4_result, tibble(monitors[i],
summary(h2s_da_model_f5b)$r.sq,
summary(h2s_da_model_f5b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Everything w. Disaster Indicator',
'CV Avg Train R-Sq' = mean(cv4_result$`summary(h2s_da_model_f5b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv4_result$adj_r2_test))))
# model 5: Everything w.o Disaster Indicator
full_complete <- daily_full %>%
select(all_of(predictors), Monitor) %>%
filter(complete.cases(.))
monitors <- unique(full_complete$Monitor)
cv5_result <- tibble(Monitor = character(),
'Train Adj-R2' = numeric(),
'Train N' = numeric(),
'Test Adj-R2' = numeric(),
'Test B' = numeric()
)
for (i in 1:length(monitors)) {
train <- full_complete %>%
filter(Monitor != monitors[i])
test <- full_complete %>%
filter(Monitor == monitors[i])
h2s_da_model_f6b <- gam(H2S_daily_avg~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_f6b, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_f6b)$n - 1)/
(summary(h2s_da_model_f6b)$n - summary(h2s_da_model_f6b)$np - 1)
cv5_result <- rbind(cv5_result, tibble(monitors[i],
summary(h2s_da_model_f6b)$r.sq,
summary(h2s_da_model_f6b)$n,
adj_r2_test,
nrow(test)))
}
result_cv <- rbind(result_cv, tibble(tibble(Model = 'Everything w.o Disaster Indicator',
'CV Avg Train R-Sq' = mean(cv5_result$`summary(h2s_da_model_f6b)$r.sq`),
'CV Avg Test R-Sq' = mean(cv5_result$adj_r2_test))))
cv1_result
cv2_result
cv3_result
cv4_result
cv5_result
10-fold CV
set.seed(1)
random_seeds <- floor(runif(5, min=0, max=1)*100)
result_10cv <- tibble(Model = character(),
'10CV AVG Train R-Sq' = numeric(),
'10CV AVG Test R-Sq' = numeric())
validation_result_10cv <- tibble(Model = character(),
'Coef' = numeric(),
'R-Sq' = numeric())
# model 1: since Feb 2022
obs_10cv_sincefeb2022 <- c()
pred_10cv_sincefeb2022 <- c()
adj_r2_sincefeb2022 <- c()
adj_r2_test_sincefeb2022 <- c()
sincefeb2022 <- daily_full %>%
filter(day >= '2022-02-01') %>%
select(all_of(predictors)) %>%
filter(complete.cases(.))
set.seed(random_seeds[1])
folds <- createFolds(seq(1, nrow(sincefeb2022)), k = 10)
for (fold in seq(1, 10)) {
test <- sincefeb2022[folds[[fold]], ]
train <- anti_join(sincefeb2022, test, by = join_by(dist_dc, day))
h2s_da_model_10cv <- gam(H2S_daily_avg~s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_10cv, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
(summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
obs_10cv_sincefeb2022 <- append(obs_10cv_sincefeb2022, test %>% pull(H2S_daily_avg))
pred_10cv_sincefeb2022 <- append(pred_10cv_sincefeb2022, predictions)
adj_r2_sincefeb2022 <- append(adj_r2_sincefeb2022, summary(h2s_da_model_10cv)$r.sq)
adj_r2_test_sincefeb2022 <- append(adj_r2_sincefeb2022, adj_r2_test)
}
result_10cv <- rbind(result_10cv, tibble(Model = 'Since Feb 2022',
'10CV AVG Train R-Sq' = mean(adj_r2_sincefeb2022),
'10CV AVG Test R-Sq' = mean(adj_r2_test_sincefeb2022)))
f2_10cv_obs_vs_pred_plot <- ggplot(tibble(obs = obs_10cv_sincefeb2022, pred = pred_10cv_sincefeb2022),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 GAM 10CV') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_10cv <- rbind(validation_result_10cv,
tibble(Model = 'Since Feb 2022',
'Coef' = summary(lm(obs_10cv_sincefeb2022 ~
pred_10cv_sincefeb2022))$coefficients[2, 1],
'R-Sq' = summary(lm(obs_10cv_sincefeb2022 ~
pred_10cv_sincefeb2022))$r.squared))
# Model 2: Disaster (Oct-Dec 2021) only
obs_10cv_disaster <- c()
pred_10cv_disaster <- c()
adj_r2_disaster <- c()
adj_r2_test_disaster <- c()
disaster <- daily_full %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.))
set.seed(random_seeds[2])
folds <- createFolds(seq(1, nrow(disaster)), k = 10)
for (fold in seq(1, 10)) {
test <- disaster[folds[[fold]], ]
train <- anti_join(disaster, test, by = join_by(dist_dc, day))
h2s_da_model_10cv <- gam(H2S_daily_avg ~ month + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_10cv, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
(summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
obs_10cv_disaster <- append(obs_10cv_disaster, test %>% pull(H2S_daily_avg))
pred_10cv_disaster <- append(pred_10cv_disaster, predictions)
adj_r2_disaster <- append(adj_r2_disaster, summary(h2s_da_model_10cv)$r.sq)
adj_r2_test_disaster <- append(adj_r2_disaster, adj_r2_test)
}
result_10cv <- rbind(result_10cv, tibble(Model = 'Disaster Only',
'10CV AVG Train R-Sq' = mean(adj_r2_disaster),
'10CV AVG Test R-Sq' = mean(adj_r2_test_disaster)))
f3_10cv_obs_vs_pred_plot <- ggplot(tibble(obs = obs_10cv_disaster, pred = pred_10cv_disaster),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only GAM 10CV') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_10cv <- rbind(validation_result_10cv,
tibble(Model = 'Disaster Only',
'Coef' = summary(lm(obs_10cv_disaster ~
pred_10cv_disaster))$coefficients[2, 1],
'R-Sq' = summary(lm(obs_10cv_disaster ~
pred_10cv_disaster))$r.squared))
# Model 3: Excluding Disaster
obs_10cv_excl_disaster <- c()
pred_10cv_excl_disaster <- c()
adj_r2_excl_disaster <- c()
adj_r2_test_excl_disaster <- c()
excl_disaster <- daily_full %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
select(all_of(predictors)) %>%
filter(complete.cases(.))
set.seed(random_seeds[3])
folds <- createFolds(seq(1, nrow(excl_disaster)), k = 10)
for (fold in seq(1, 10)) {
test <- excl_disaster[folds[[fold]], ]
train <- anti_join(excl_disaster, test, by = join_by(dist_dc, day))
h2s_da_model_10cv <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_10cv, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
(summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
obs_10cv_excl_disaster <- append(obs_10cv_excl_disaster, test %>% pull(H2S_daily_avg))
pred_10cv_excl_disaster <- append(pred_10cv_excl_disaster, predictions)
adj_r2_excl_disaster <- append(adj_r2_excl_disaster, summary(h2s_da_model_10cv)$r.sq)
adj_r2_test_excl_disaster <- append(adj_r2_excl_disaster, adj_r2_test)
}
result_10cv <- rbind(result_10cv, tibble(Model = 'Exclude Disaster',
'10CV AVG Train R-Sq' = mean(adj_r2_excl_disaster),
'10CV AVG Test R-Sq' = mean(adj_r2_test_excl_disaster)))
# Model 4: Everything with Disaster Indicator
obs_10cv_disaster_ind <- c()
pred_10cv_disaster_ind <- c()
adj_r2_disaster_ind <- c()
adj_r2_test_disaster_ind <- c()
disaster_ind <- daily_full %>%
select(all_of(predictors)) %>%
mutate(disaster = if_else(year == '2021' & month %in% c('10', '11', '12'), 1, 0)) %>%
filter(complete.cases(.))
set.seed(random_seeds[4])
folds_1 <- createFolds(seq(1, nrow(disaster_ind %>% filter(disaster==1))), k = 10)
folds_0 <- createFolds(seq(1, nrow(disaster_ind %>% filter(disaster==0))), k = 10)
for (fold in seq(1, 10)) {
test <- rbind(disaster_ind[folds_1[[fold]], ], disaster_ind[folds_0[[fold]], ])
train <- anti_join(disaster_ind, test, by = join_by(dist_dc, day))
h2s_da_model_10cv <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_10cv, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
(summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
obs_10cv_disaster_ind <- append(obs_10cv_disaster_ind, test %>% pull(H2S_daily_avg))
pred_10cv_disaster_ind <- append(pred_10cv_disaster_ind, predictions)
adj_r2_disaster_ind <- append(adj_r2_disaster_ind, summary(h2s_da_model_10cv)$r.sq)
adj_r2_test_disaster_ind <- append(adj_r2_disaster_ind, adj_r2_test)
}
result_10cv <- rbind(result_10cv, tibble(Model = 'Everything w. Disaster Indicator',
'10CV AVG Train R-Sq' = mean(adj_r2_disaster_ind),
'10CV AVG Test R-Sq' = mean(adj_r2_test_disaster_ind)))
# Model 5: Everything
obs_10cv_everything <- c()
pred_10cv_everything <- c()
adj_r2_everything <- c()
adj_r2_test_everything <- c()
everything <- daily_full %>%
select(all_of(predictors)) %>%
filter(complete.cases(.))
set.seed(random_seeds[5])
folds <- createFolds(seq(1, nrow(everything), k = 10))
## Warning: In seq.default(1, nrow(everything), k = 10) :
## extra argument 'k' will be disregarded
for (fold in seq(1, 10)) {
test <- everything[folds[[fold]], ]
train <- anti_join(everything, test, by = join_by(dist_dc, day))
h2s_da_model_10cv <- gam(H2S_daily_avg ~ s(as.numeric(month),bs='cc') + year + as.character(weekday) +
wd_avg + ws_avg + daily_downwind_ref + I(1/dist_wrp^2) +
s(I(mon_utm_x/10^3), I(mon_utm_y/10^3), bs='tp', k = 10) +
te(I(mon_utm_x/10^3), I(mon_utm_y/10^3), as.numeric(day),
k = c(10,10), d = c(2,1), bs = c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
capacity + dist_dc + avg_temp + avg_hum + precip,
data = train)
predictions <- predict(h2s_da_model_10cv, newdata = test)
r2_test <- R2(test$H2S_daily_avg, predictions)
adj_r2_test <- 1 - (1-r2_test)*(summary(h2s_da_model_10cv)$n - 1)/
(summary(h2s_da_model_10cv)$n - summary(h2s_da_model_10cv)$np - 1)
obs_10cv_everything <- append(obs_10cv_everything, test %>% pull(H2S_daily_avg))
pred_10cv_everything <- append(pred_10cv_everything, predictions)
adj_r2_everything <- append(adj_r2_everything, summary(h2s_da_model_10cv)$r.sq)
adj_r2_test_everything <- append(adj_r2_everything, adj_r2_test)
}
result_10cv <- rbind(result_10cv, tibble(Model = 'Everything w.o Disaster Indicator',
'10CV AVG Train R-Sq' = mean(adj_r2_everything),
'10CV AVG Test R-Sq' = mean(adj_r2_test_everything)))
f6_10cv_obs_vs_pred_plot <- ggplot(tibble(obs = obs_10cv_everything, pred = pred_10cv_everything),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator GAM CV') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
f6_10cv_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = obs_10cv_everything, pred = pred_10cv_everything),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(NA, 15), ylim = c(NA, 50)) +
theme_bw()
validation_result_10cv <- rbind(validation_result_10cv,
tibble(Model = 'Everything',
'Coef' = summary(lm(obs_10cv_everything ~
pred_10cv_everything))$coefficients[2, 1],
'R-Sq' = summary(lm(obs_10cv_everything ~
pred_10cv_everything))$r.squared))
GAM result
knitr::kable(tibble(Model = c('Since Feb 2022',
'Disaster Only',
'Exclude Disaster',
'Everything w. Disaster Indicator',
'Everything w.o Disaster Indicator'),
'Train R-sq' = c(round(summary(h2s_da_model_f)$r.sq, 2),
round(summary(h2s_da_model_g)$r.sq, 2),
round(summary(h2s_da_model_h)$r.sq, 2),
round(summary(h2s_da_model_i)$r.sq, 2),
round(summary(h2s_da_model_j)$r.sq, 2))) %>%
left_join(result, join_by(Model)) %>%
left_join(result_cv, join_by(Model)) %>%
left_join(result_10cv, join_by(Model)),
digits = 3)
| Since Feb 2022 |
0.67 |
0.654 |
0.682 |
0.667 |
0.236 |
0.664 |
0.665 |
| Disaster Only |
0.50 |
0.518 |
0.292 |
0.671 |
0.094 |
0.497 |
0.439 |
| Exclude Disaster |
0.46 |
0.519 |
0.295 |
0.477 |
0.085 |
0.463 |
0.446 |
| Everything w. Disaster Indicator |
0.33 |
0.386 |
0.612 |
0.498 |
0.115 |
0.321 |
0.327 |
| Everything w.o Disaster Indicator |
0.33 |
0.337 |
0.444 |
0.496 |
0.107 |
0.329 |
0.341 |
Observed vs. Predicted plots
ggarrange(f2_obs_vs_pred_plot,
f3_obs_vs_pred_plot,
ggarrange(f6_obs_vs_pred_plot, f6_obs_vs_pred_plot_zoom, ncol = 2, labels = c("3", "4")),
labels = c("1", "2"),
nrow = 3)

knitr::kable(validation_result_8020, digits = 3)
| Since Feb 2022 |
0.966 |
0.700 |
| Disaster Only |
0.761 |
0.414 |
| Everything |
1.201 |
0.321 |
ggarrange(f2_10cv_obs_vs_pred_plot,
f3_10cv_obs_vs_pred_plot,
ggarrange(f6_10cv_obs_vs_pred_plot, f6_10cv_obs_vs_pred_plot_zoom, ncol = 2, labels = c("3", "4")),
labels = c("1", "2"),
nrow = 3)

knitr::kable(validation_result_10cv, digits = 3)
| Since Feb 2022 |
0.980 |
0.649 |
| Disaster Only |
0.951 |
0.456 |
| Everything |
0.865 |
0.271 |
Monthly
h2s_ma_model_a <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery, data = full_data)
summary(h2s_ma_model_a)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## H2S_monthly_average ~ s(as.numeric(month), bs = "cc") + year +
## wd_sec + ws + I(1/MinDist^2) + Refinery
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.469e+00 3.010e-01 4.879 1.07e-06 ***
## year2021 4.569e+00 1.704e-01 26.817 < 2e-16 ***
## year2022 -4.981e+00 1.895e-01 -26.285 < 2e-16 ***
## year2023 -2.272e+00 2.389e-01 -9.510 < 2e-16 ***
## wd_secQ2 -9.341e-02 1.781e-01 -0.524 0.6
## wd_secQ3 1.662e+00 1.811e-01 9.179 < 2e-16 ***
## wd_secQ4 -8.799e-01 1.712e-01 -5.140 2.74e-07 ***
## ws 7.904e-02 1.924e-02 4.109 3.98e-05 ***
## I(1/MinDist^2) 9.127e+05 1.544e+05 5.911 3.41e-09 ***
## RefineryMarathon (Carson) 2.223e+01 2.465e-01 90.170 < 2e-16 ***
## RefineryMarathon (Wilmington) 3.672e+00 2.867e-01 12.808 < 2e-16 ***
## RefineryPhillips 66 (Wilmington) 2.444e+00 2.532e-01 9.652 < 2e-16 ***
## RefineryTorrance Refinery -1.154e+00 2.332e-01 -4.949 7.48e-07 ***
## RefineryValero Refinery 5.945e+00 2.804e-01 21.202 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(as.numeric(month)) 8 8 5716 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0357 Deviance explained = 3.57%
## GCV = 5419.9 Scale est. = 5419.9 n = 2056378
plot(h2s_ma_model_a)

# h2s_ma_model_b <- gam(H2S_monthly_average~s(as.numeric(month),bs='cc')+year+wd_sec+ws+I(1/MinDist^2)+Refinery+s(latitude, longitude, bs='tp', k=10), data = full_data)
# summary(h2s_ma_model_b)
# plot(h2s_ma_model_b)
XGBoost Since 2022 Feb
validation_result <- tibble(Model = character(),
'Coef' = character(),
'R-Sq' = numeric())
xgb_result <- tibble(Model = character(),
'R-Sq' = numeric(),
'80/20 Train R-Sq' = numeric(),
'80/20 Test R-Sq' = numeric())
daily_full <- daily_full %>%
mutate(Refinery = str_replace_all(str_replace_all(Refinery, '[()]', ''), ' ', '_'),
Monitor = str_replace_all(Monitor, ' ', '_'),
weekday = as.character(weekday),
daily_downwind_ref = as.integer(daily_downwind_ref),
daily_downwind_wrp = as.integer(daily_downwind_wrp))
train <- daily_full[complete.cases(daily_full),] %>%
filter(day >= '2022-02-01')
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
max_depth = c(3, 4, 5),
eta = c(0.1, 0.3),
gamma = c(0.01, 0.001),
colsample_bytree = c(0.5, 1),
min_child_weight = 0,
subsample = c(0.5, 0.75, 1))
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv",
number=10,
verboseIter=TRUE,
search='grid')
fit.xgb_da <- readRDS('rfiles/fit.xgb_da.rds')
# fit.xgb_da <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da, 'rfiles/fit.xgb_da.rds')
getTrainPerf(fit.xgb_da)
fit.xgb_da$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.2 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 42
## niter: 500
## nfeatures : 42
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96 500 5 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : span too small. fewer data values than degrees of freedom.
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 40.97
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 2.03
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 4.1209
## Warning in sqrt(sum.squares/one.delta): NaNs produced

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da$finalModel, top_n = 20)

80/20 CV
set.seed(313)
validation_result_8020 <- tibble(Model = character(),
'Coef' = character(),
'R-Sq' = numeric())
train <- daily_full[complete.cases(daily_full),] %>%
filter(day >= '2022-02-01') %>%
slice_sample(prop = 0.8)
test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(day >= '2022-02-01'),
train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
test <- fastDummies::dummy_cols(test %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
# Try for a continuous month
tune_grid <- expand.grid(nrounds = c(100, 200, 500),
max_depth = c(3, 4, 5),
eta = c(0.1, 0.3),
gamma = c(0.01, 0.001),
colsample_bytree = c(0.5, 1),
min_child_weight = 0,
subsample = c(0.5, 0.75, 1))
# Run algorithms using 10-fold cross validation
control <- trainControl(method="cv",
number=10,
verboseIter=TRUE,
search='grid')
fit.xgb_da_8020 <- readRDS('rfiles/fit.xgb_da_8020.rds')
# fit.xgb_da_8020 <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_8020, 'rfiles/fit.xgb_da_8020.rds')
getTrainPerf(fit.xgb_da_8020)
# Test statistics
predictions <- predict(fit.xgb_da_8020$finalModel,
newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions, test %>% pull(H2S_daily_avg))
test_stat
## RMSE Rsquared MAE
## 0.2835988 0.7685410 0.1748351
xgb_result <- rbind(xgb_result,
tibble(Model = 'Since Feb 2022',
'R-Sq' = getTrainPerf(fit.xgb_da)$TrainRsquared,
'80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_8020)$TrainRsquared,
'80/20 Test R-Sq' = test_stat[[2]]))
xgb_sincefeb2022_obs_vs_pred_plot_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Since 2022 XGBoost') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result_8020 <- rbind(validation_result_8020,
tibble(Model = 'Since Feb 2022',
'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$coefficients[2, 1],
'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$r.squared))
fit.xgb_da_8020$finalModel
## ##### xgb.Booster
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## # of features: 42
## niter: 500
## nfeatures : 42
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints year_2022 year_2023 month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96 500 5 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_8020$finalModel, top_n = 12, n_col = 3)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 3.83e+05
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 4000
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : Chernobyl! trL>n 9
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : Chernobyl! trL>n 9
## Warning in sqrt(sum.squares/one.delta): NaNs produced

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_8020$finalModel, top_n = 20)

CV by leaving each monitor out once
post2022feb <- daily_full %>%
filter(day >= '2022-02-01')
monitor_names <- unique(post2022feb$Monitor)
# for (i in 1:length(monitor_names)) {
# train <- post2022feb %>%
# filter(Monitor != monitor_names[i])
#
# train <- train[complete.cases(train),]
#
# train <- fastDummies::dummy_cols(train %>%
# select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
# monitor_lat, monitor_lon, county, dist_213)) %>%
# mutate(MinDist = 1/(MinDist^2),
# dist_wrp = 1/(dist_wrp^2)),
# remove_selected_columns = TRUE)
#
# fit.xgb_da <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
#
# saveRDS(fit.xgb_da, paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
# }
model_stats <- tibble(Monitor = character(), train_r2 = numeric(), test_r2 = numeric())
add_cols <- function(df, cols) {
add <- cols[!cols %in% names(df)]
if(length(add) !=0 ) df[add] <- NA
return(df)
}
for (i in setdiff(1:length(monitor_names), c(9))) {
fit.xgb_da <- readRDS(paste0('rfiles/fit.xgb_da_', monitor_names[i], '.rds'))
test <- post2022feb %>%
filter(Monitor == monitor_names[i])
test <- fastDummies::dummy_cols(test[complete.cases(test),] %>%
ungroup() %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213)) %>%
add_cols(c('year_2022', 'year_2023', 'month_01', 'month_02',
'month_03', 'month_04', 'month_05', 'month_06',
'month_07', 'month_08', 'month_09', 'month_10',
'month_11', 'month_12')) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2),
year_2022 = ifelse(is.na(year_2022), 0, year_2022),
year_2023 = ifelse(is.na(year_2023), 0, year_2023),
month_01 = ifelse(is.na(month_01), 0, month_01),
month_02 = ifelse(is.na(month_02), 0, month_02),
month_03 = ifelse(is.na(month_03), 0, month_03),
month_04 = ifelse(is.na(month_04), 0, month_04),
month_05 = ifelse(is.na(month_05), 0, month_05),
month_06 = ifelse(is.na(month_06), 0, month_06),
month_07 = ifelse(is.na(month_07), 0, month_07),
month_08 = ifelse(is.na(month_08), 0, month_08),
month_09 = ifelse(is.na(month_09), 0, month_09),
month_10 = ifelse(is.na(month_10), 0, month_10),
month_11 = ifelse(is.na(month_11), 0, month_11),
month_12 = ifelse(is.na(month_12), 0, month_12)),
remove_selected_columns = TRUE)
train_r2 <- getTrainPerf(fit.xgb_da)$TrainRsquared
predictions <- predict(fit.xgb_da$finalModel,
newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_r2 <- R2(test %>% pull(H2S_daily_avg), predictions)
model_stats <- rbind(model_stats, tibble(Monitor = monitor_names[i], train_r2, test_r2))
}
model_stats
tibble(Monitor = 'Total Average', train_r2 = mean(model_stats$train_r2, na.rm = T),
test_r2 = mean(model_stats$test_r2, na.rm = T))
XGBoost Disaster
Daily average model
train <- daily_full[complete.cases(daily_full),] %>%
filter(year == '2021', month %in% c('10', '11', '12'))
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
fit.xgb_da_dis <- readRDS('rfiles/fit.xgb_da_dis.rds')
# fit.xgb_da_dis <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis, 'rfiles/fit.xgb_da_dis.rds')
getTrainPerf(fit.xgb_da_dis)
fit.xgb_da_dis$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 31
## niter: 500
## nfeatures : 31
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 96 500 5 0.1 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis,scale=FALSE)
plot(imp, top = 20)

Get model performance at each fold
obs_xgb_disaster <- c()
pred_xgb_disaster <- c()
r2_test_xgb_disaster <- c()
for (fold in seq(1, 10)) {
test_fold <- train[fit.xgb_da_dis$control$indexOut[[fold]], ]
test_pred <- predict(fit.xgb_da_dis$finalModel,
newdata = test_fold %>% select(-H2S_daily_avg) %>% as.matrix())
test_r2 <- postResample(test_pred, test_fold %>% pull(H2S_daily_avg))[[2]]
obs_xgb_disaster <- append(obs_xgb_disaster, test_fold %>% pull(H2S_daily_avg))
pred_xgb_disaster <- append(pred_xgb_disaster, test_pred)
r2_test_xgb_disaster <- append(r2_test_xgb_disaster, test_r2)
}
xgb_disaster_obs_vs_pred_plot <- ggplot(tibble(obs = obs_xgb_disaster, pred = pred_xgb_disaster),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for Disaster Only XGBoost') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
xgb_disaster_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = obs_xgb_disaster, pred = pred_xgb_disaster),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 50), ylim = c(0, 50)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Disaster Only',
'Coef' = summary(lm(obs_xgb_disaster ~
pred_xgb_disaster))$coefficients[2, 1],
'R-Sq' = summary(lm(obs_xgb_disaster ~
pred_xgb_disaster))$r.squared))
SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_dis$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_dis$finalModel, top_n = 20)

80/20 CV
set.seed(313)
train <- daily_full[complete.cases(daily_full),] %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
slice_sample(prop = 0.8)
test <- anti_join(daily_full[complete.cases(daily_full),] %>% filter(year == '2021', month %in% c('10', '11', '12')),
train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
test <- fastDummies::dummy_cols(test %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
fit.xgb_da_dis_8020 <- readRDS('rfiles/fit.xgb_da_dis_8020.rds')
# fit.xgb_da_dis_8020 <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_dis_8020, 'rfiles/fit.xgb_da_dis_8020.rds')
getTrainPerf(fit.xgb_da_dis_8020)
# Test statistics
predictions <- predict(fit.xgb_da_dis_8020$finalModel,
newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions,
test %>% pull(H2S_daily_avg))
test_stat
## RMSE Rsquared MAE
## 23.4073974 0.6423252 6.3878800
xgb_result <- rbind(xgb_result,
tibble(Model = 'Disaster Only',
'R-Sq' = getTrainPerf(fit.xgb_da_dis)$TrainRsquared,
'80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_dis_8020)$TrainRsquared,
'80/20 Test R-Sq' = test_stat[[2]]))
xgb_dis_obs_vs_pred_plot_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Disaster Only XGBoost') +
theme_bw()
xgb_dis_obs_vs_pred_plot_zoom_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 10), ylim = c(0, 10)) +
theme_bw()
validation_result_8020 <- rbind(validation_result_8020,
tibble(Model = 'Disaster Only',
'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$coefficients[2, 1],
'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$r.squared))
fit.xgb_da_dis_8020$finalModel
## ##### xgb.Booster
## raw: 209.9 Kb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## # of features: 31
## niter: 100
## nfeatures : 31
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 205 100 5 0.3 0.01 0.5 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_dis_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_dis_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_dis_8020$finalModel, top_n = 20)

XGBoost Full
Daily average model
train <- daily_full[complete.cases(daily_full),]
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
fit.xgb_da_full <- readRDS('rfiles/fit.xgb_da_full.rds')
# fit.xgb_da_full <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full, 'rfiles/fit.xgb_da_full.rds')
getTrainPerf(fit.xgb_da_full)
fit.xgb_da_full$finalModel
## ##### xgb.Booster
## Handle is invalid! Suggest using xgb.Booster.complete
## raw: 1.2 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.3", max_depth = "5", gamma = "0.01", colsample_bytree = "0.5", min_child_weight = "0", subsample = "0.75", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## # of features: 40
## niter: 500
## nfeatures : 40
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 204 500 5 0.3 0.01 0.5 0 0.75
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full,scale=FALSE)
plot(imp, top = 20)

Get model performance at each fold
obs_xgb_everything <- c()
pred_xgb_everything <- c()
r2_test_xgb_everything <- c()
for (fold in seq(1, 10)) {
test_fold <- train[fit.xgb_da_full$control$indexOut[[fold]], ]
test_pred <- predict(fit.xgb_da_full$finalModel,
newdata = test_fold %>% select(-H2S_daily_avg) %>% as.matrix())
test_r2 <- postResample(test_pred, test_fold %>% pull(H2S_daily_avg))[[2]]
obs_xgb_everything <- append(obs_xgb_everything, test_fold %>% pull(H2S_daily_avg))
pred_xgb_everything <- append(pred_xgb_everything, test_pred)
r2_test_xgb_everything <- append(r2_test_xgb_everything, test_r2)
}
xgb_everything_obs_vs_pred_plot <- ggplot(tibble(obs = obs_xgb_everything, pred = pred_xgb_everything),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Observed vs Predicted for everything XGBoost') +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
xgb_everything_obs_vs_pred_plot_zoom <- ggplot(tibble(obs = obs_xgb_everything, pred = pred_xgb_everything),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x, geom = 'smooth') +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 30), ylim = c(0, 30)) +
theme_bw()
## Warning in geom_smooth(method = "lm", formula = y ~ x, geom = "smooth"):
## Ignoring unknown parameters: `geom`
validation_result <- rbind(validation_result,
tibble(Model = 'Everything w.o Disaster Indicator',
'Coef' = summary(lm(obs_xgb_everything ~
pred_xgb_everything))$coefficients[2, 1],
'R-Sq' = summary(lm(obs_xgb_everything ~
pred_xgb_everything))$r.squared))
SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_full$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_full$finalModel, top_n = 20)

80/20 CV
set.seed(313)
train <- rbind(
daily_full[complete.cases(daily_full),] %>%
filter(!(year == '2021' & month %in% c('10', '11', '12'))) %>%
slice_sample(prop = 0.8),
daily_full[complete.cases(daily_full),] %>%
filter(year == '2021', month %in% c('10', '11', '12')) %>%
slice_sample(prop = 0.8)
)
test <- anti_join(daily_full[complete.cases(daily_full),], train)
## Joining with `by = join_by(H2S_daily_max, H2S_daily_avg, H2S_monthly_average,
## monitor_lat, monitor_lon, mon_utm_x, mon_utm_y, Monitor, MinDist,
## Converted_Angle, Refinery, year, month, day, weekday, active_1km,
## monthly_oil_1km, monthly_gas_1km, dist_wrp, capacity, wrp_angle, dist_dc,
## dist_213, avg_temp, avg_hum, precip, ws_avg, wd_avg, daily_downwind_ref,
## daily_downwind_wrp, elevation, EVI, county, num_odor_complaints)`
train <- fastDummies::dummy_cols(train %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
test <- fastDummies::dummy_cols(test %>%
select(-c(Refinery, Monitor, day, H2S_daily_max, H2S_monthly_average,
monitor_lat, monitor_lon, county, dist_213, year)) %>%
mutate(MinDist = 1/(MinDist^2),
dist_wrp = 1/(dist_wrp^2)),
remove_selected_columns = TRUE)
fit.xgb_da_full_8020 <- readRDS('rfiles/fit.xgb_da_full_8020.rds')
# fit.xgb_da_full_8020 <- train(H2S_daily_avg~.,
# method = 'xgbTree',
# data = train,
# trControl=control,
# tuneGrid = tune_grid,
# tuneLength = 10, importance=TRUE, verbosity = 0, verbose=FALSE)
# saveRDS(fit.xgb_da_full_8020, 'rfiles/fit.xgb_da_full_8020.rds')
getTrainPerf(fit.xgb_da_full_8020)
# Test statistics
predictions <- predict(fit.xgb_da_full_8020$finalModel,
newdata = test %>% select(-H2S_daily_avg) %>% as.matrix())
test_stat <- postResample(predictions,
test %>% pull(H2S_daily_avg))
test_stat
## RMSE Rsquared MAE
## 2.4223982 0.2317491 0.4381407
xgb_result <- rbind(xgb_result,
tibble(Model = 'Everything w.o Disaster Indicator',
'R-Sq' = getTrainPerf(fit.xgb_da_full)$TrainRsquared,
'80/20 Train R-Sq' = getTrainPerf(fit.xgb_da_full_8020)$TrainRsquared,
'80/20 Test R-Sq' = test_stat[[2]]))
xgb_full_obs_vs_pred_plot_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Everything w.o Disaster Indicator XGBoost') +
theme_bw()
xgb_full_obs_vs_pred_plot_zoom_8020 <- ggplot(tibble(obs = test %>% pull(H2S_daily_avg), pred = predictions),
aes(x = pred, y = obs)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = 'red') +
geom_smooth(method = 'lm', formula = y ~ x) +
labs(y = 'Observed', x = 'Predicted',
title = 'Zoomed In') +
coord_cartesian(xlim = c(0, 10), ylim = c(0, 10)) +
theme_bw()
validation_result_8020 <- rbind(validation_result_8020,
tibble(Model = 'Everything',
'Coef' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$coefficients[2, 1],
'R-Sq' = summary(lm(test %>% pull(H2S_daily_avg) ~
predictions))$r.squared))
fit.xgb_da_full_8020$finalModel
## ##### xgb.Booster
## raw: 1.1 Mb
## call:
## xgboost::xgb.train(params = list(eta = param$eta, max_depth = param$max_depth,
## gamma = param$gamma, colsample_bytree = param$colsample_bytree,
## min_child_weight = param$min_child_weight, subsample = param$subsample),
## data = x, nrounds = param$nrounds, verbose = FALSE, objective = "reg:squarederror",
## importance = TRUE, verbosity = 0)
## params (as set within xgb.train):
## eta = "0.1", max_depth = "5", gamma = "0.001", colsample_bytree = "0.5", min_child_weight = "0", subsample = "1", objective = "reg:squarederror", importance = "TRUE", verbosity = "0", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## # of features: 40
## niter: 500
## nfeatures : 40
## xNames : mon_utm_x mon_utm_y MinDist Converted_Angle active_1km monthly_oil_1km monthly_gas_1km dist_wrp capacity wrp_angle dist_dc avg_temp avg_hum precip ws_avg wd_avg daily_downwind_ref daily_downwind_wrp elevation EVI num_odor_complaints month_01 month_02 month_03 month_04 month_05 month_06 month_07 month_08 month_09 month_10 month_11 month_12 weekday_Fri weekday_Mon weekday_Sat weekday_Sun weekday_Thu weekday_Tue weekday_Wed
## problemType : Regression
## tuneValue :
## nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
## 81 500 5 0.1 0.001 0.5 0 1
## obsLevels : NA
## param :
## $importance
## [1] TRUE
##
## $verbosity
## [1] 0
##
## $verbose
## [1] FALSE
imp<-varImp(fit.xgb_da_full_8020,scale=FALSE)
plot(imp, top = 20)

SHAP for XGBoost above (Caret)
matrix <- train %>% select(-H2S_daily_avg)
matrix <- as.matrix(sapply(matrix, as.numeric))
xgb.plot.shap(data = matrix,
model = fit.xgb_da_full_8020$finalModel, top_n = 12, n_col = 3)

xgb.ggplot.shap.summary(data = matrix,
model = fit.xgb_da_full_8020$finalModel, top_n = 20)

XGBoost Result
xgb_result
Observed Vs. Predicted Plots 10-fold CV
ggarrange(xgb_sincefeb2022_obs_vs_pred_plot,
ggarrange(xgb_disaster_obs_vs_pred_plot, xgb_disaster_obs_vs_pred_plot_zoom,
ncol = 2, labels = c("2", "3")),
ggarrange(xgb_everything_obs_vs_pred_plot, xgb_everything_obs_vs_pred_plot_zoom,
ncol = 2, labels = c("4", "5")),
labels = c("1"),
nrow = 3)

knitr::kable(validation_result, digits = 3)
| Since Feb 2022 |
1.026 |
0.989 |
| Disaster Only |
0.675 |
0.666 |
| Everything w.o Disaster Indicator |
1.000 |
1.000 |
Observed Vs. Predicted Plots 10-fold CV + 80/20 split
ggarrange(xgb_sincefeb2022_obs_vs_pred_plot_8020,
ggarrange(xgb_dis_obs_vs_pred_plot_8020, xgb_dis_obs_vs_pred_plot_zoom_8020,
ncol = 2, labels = c("2", "3")),
ggarrange(xgb_full_obs_vs_pred_plot_8020, xgb_full_obs_vs_pred_plot_zoom_8020,
ncol = 2, labels = c("4", "5")),
labels = c("1"),
nrow = 3)

knitr::kable(validation_result_8020)
| Since Feb 2022 |
1.0204725 |
0.7685410 |
| Disaster Only |
0.7062446 |
0.6423252 |
| Everything |
0.9326755 |
0.2317491 |
Extreme Value Models
EVGAM
# for this part, I will fit from Jan-2021 to April-2022
ev_data <- daily_full %>%
filter(day >= '2021-01-01', day < '2022-05-01') %>%
mutate(month = as.numeric(month),
weekday = as.character(weekday),
day = as.numeric(day),
dist_wrp = I(1/dist_wrp^2),
dist_dc = I(1/dist_dc^2),
mon_utm_x = I(mon_utm_x/10^3),
mon_utm_y = I(mon_utm_y/10^3))
daily_max_gev <- list(H2S_daily_max ~ s(month,bs='cc') + year,
~ s(month,bs='cc') + year +
weekday + wd_avg + ws_avg + daily_downwind_ref +
capacity + dist_wrp +
s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
te(mon_utm_x, mon_utm_y, day,
k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
monthly_oil_1km + monthly_gas_1km + active_1km +
daily_downwind_wrp + elevation + EVI + num_odor_complaints +
dist_dc + avg_temp + avg_hum + precip,
~ 1)
dm_gev <- readRDS('rfiles/dm_gev.rds')
# dm_gev <- evgam(daily_max_gev, ev_data, family = "gev")
# saveRDS(dm_gev, 'rfiles/dm_gev.rds')
summary(dm_gev)
##
## ** Parametric terms **
##
## location
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.39 0.02 57.56 <2e-16
## year2022 0.14 0.06 2.39 0.00846
##
## logscale
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.61 1.78 -3.720000e+00 9.93e-05
## year2022 0.49 0.14 3.560000e+00 0.000184
## weekdayMon 0.05 0.05 9.800000e-01 0.162
## weekdaySat -0.01 0.05 -1.900000e-01 0.424
## weekdaySun 0.00 0.05 -4.000000e-02 0.484
## weekdayThu 0.11 0.05 2.170000e+00 0.015
## weekdayTue 0.08 0.05 1.630000e+00 0.0516
## weekdayWed 0.03 0.05 5.100000e-01 0.303
## wd_avg 0.00 0.00 2.300000e+00 0.0108
## ws_avg 0.01 0.01 6.000000e-01 0.273
## daily_downwind_ref 0.07 0.05 1.400000e+00 0.0813
## capacity 0.00 0.00 -2.990000e+00 0.0014
## dist_wrp 45424.75 0.00 2.423312e+10 <2e-16
## monthly_oil_1km 0.00 0.00 7.040000e+00 9.96e-13
## monthly_gas_1km 0.00 0.00 -1.058000e+01 <2e-16
## active_1km 0.03 0.03 1.050000e+00 0.148
## daily_downwind_wrp -0.03 0.06 -5.200000e-01 0.303
## elevation -0.03 0.01 -3.160000e+00 0.000798
## EVI -0.46 0.15 -3.010000e+00 0.00131
## num_odor_complaints 0.05 0.01 8.660000e+00 <2e-16
## dist_dc 3098.92 0.00 2.100542e+07 <2e-16
## avg_temp 0.01 0.00 1.300000e+00 0.0962
## avg_hum 0.00 0.00 -1.930000e+00 0.0267
## precip -0.06 0.08 -7.400000e-01 0.23
##
## shape
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.21 0.02 11.44 <2e-16
##
## ** Smooth terms **
##
## location
## edf max.df Chi.sq Pr(>|t|)
## s(month) 6.2 8 375.69 <2e-16
##
## logscale
## edf max.df Chi.sq Pr(>|t|)
## s(month) 7.80 8 356.16 <2e-16
## s(mon_utm_x,mon_utm_y) 5.52 9 27.63 4.12e-05
## te(mon_utm_x,mon_utm_y,day) 54.02 80 290.64 <2e-16
plot(dm_gev)

MGCV
m1 <- readRDS('rfiles/m1.rds')
# m1 <- gam(list(H2S_daily_max ~ s(month,bs='cc') + year,
# ~ s(month,bs='cc') + year +
# weekday + wd_avg + ws_avg + daily_downwind_ref +
# capacity + dist_wrp +
# s(mon_utm_x, mon_utm_y, bs='tp', k = 10) +
# te(mon_utm_x, mon_utm_y, day,
# k=c(10,10),d=c(2,1),bs=c('tp','cc')) +
# monthly_oil_1km + monthly_gas_1km + active_1km +
# daily_downwind_wrp + elevation + EVI + num_odor_complaints +
# dist_dc + avg_temp + avg_hum + precip,
# ~ 1),
# data = ev_data, method = "REML",
# family = gevlss(link = list("identity", "identity", "identity")))
# saveRDS(m1, 'rfiles/m1.rds')
summary(m1)
##
## Family: gevlss
## Link function: identity identity identity
##
## Formula:
## H2S_daily_max ~ s(month, bs = "cc") + year
## ~s(month, bs = "cc") + year + weekday + wd_avg + ws_avg + daily_downwind_ref +
## capacity + dist_wrp + s(mon_utm_x, mon_utm_y, bs = "tp",
## k = 10) + te(mon_utm_x, mon_utm_y, day, k = c(10, 10), d = c(2,
## 1), bs = c("tp", "cc")) + monthly_oil_1km + monthly_gas_1km +
## active_1km + daily_downwind_wrp + elevation + EVI + num_odor_complaints +
## dist_dc + avg_temp + avg_hum + precip
## ~1
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.392e+00 2.397e-02 58.102 < 2e-16 ***
## year2022 1.361e-01 5.650e-02 2.408 0.016032 *
## (Intercept).1 -6.973e+00 1.758e+00 -3.966 7.30e-05 ***
## year2022.1 4.965e-01 1.393e-01 3.565 0.000364 ***
## weekdayMon.1 5.013e-02 5.205e-02 0.963 0.335492
## weekdaySat.1 -9.562e-03 4.864e-02 -0.197 0.844165
## weekdaySun.1 -3.141e-03 4.897e-02 -0.064 0.948862
## weekdayThu.1 1.112e-01 5.116e-02 2.173 0.029786 *
## weekdayTue.1 8.063e-02 5.002e-02 1.612 0.106935
## weekdayWed.1 2.570e-02 5.172e-02 0.497 0.619250
## wd_avg.1 4.441e-04 1.908e-04 2.328 0.019930 *
## ws_avg.1 5.815e-03 1.002e-02 0.580 0.561639
## daily_downwind_ref.1 6.469e-02 4.793e-02 1.350 0.177101
## capacity.1 -1.406e-03 5.051e-04 -2.785 0.005359 **
## dist_wrp.1 -3.640e+05 1.663e+06 -0.219 0.826789
## monthly_oil_1km.1 8.399e-04 1.148e-04 7.313 2.62e-13 ***
## monthly_gas_1km.1 -2.707e-03 2.570e-04 -10.533 < 2e-16 ***
## active_1km.1 2.400e-02 2.542e-02 0.944 0.345128
## daily_downwind_wrp.1 -3.218e-02 6.327e-02 -0.509 0.611053
## elevation.1 -3.246e-02 1.071e-02 -3.031 0.002436 **
## EVI.1 -4.050e-01 4.274e-01 -0.947 0.343434
## num_odor_complaints.1 5.175e-02 5.954e-03 8.692 < 2e-16 ***
## dist_dc.1 3.003e+03 5.827e+02 5.154 2.55e-07 ***
## avg_temp.1 6.181e-03 4.631e-03 1.335 0.182010
## avg_hum.1 -2.800e-03 1.439e-03 -1.946 0.051672 .
## precip.1 -5.810e-02 7.946e-02 -0.731 0.464715
## (Intercept).2 2.061e-01 1.778e-02 11.589 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(month) 5.946 8.000 531.90 < 2e-16 ***
## s.1(month) 7.743 8.000 372.15 < 2e-16 ***
## s.1(mon_utm_x,mon_utm_y) 4.972 5.554 22.42 0.00041 ***
## te.1(mon_utm_x,mon_utm_y,day) 53.079 80.000 380.71 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Deviance explained = NA%
## -REML = 7658.9 Scale est. = 1 n = 3904
plot(m1, pages = 1, scheme = 1, scale = 0, seWithMean = TRUE)
